# Turkish Sentiment Analysis With Multinomial Naive Bayes

# Step 1: import required libs

In [1]:
import nltk
import string
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score,cross_validate
from sklearn.metrics import make_scorer, accuracy_score, f1_score, precision_score, recall_score

# Step 2: import  Verius NLP tools

In [2]:
from veriusapigateway import VeriUsAPIGateway
vu = VeriUsAPIGateway("API_KEY")

# Step 3: load the dataset

In [3]:
df = pd.read_csv('./sample_beyazperde_dataset.csv')

# Step 4: drop the "NAN" values from dataset

In [4]:
df.dropna(inplace=True)

# Step 5:  drop "Neutral" labeled data if there exists

In [5]:
df = df[df.target !="Neutral"]

# Step 6:  load Turkish stopwords

In [6]:
with open("stopwords.txt", "r") as sw:
        stops = [s.strip() for s in sw.readlines()]

# Step 7:  inspect the size of the "Positive" and "Negative" labeled data

In [7]:
df.groupby("target").nunique()

Unnamed: 0_level_0,target,text
target,Unnamed: 1_level_1,Unnamed: 2_level_1
Negative,1,637
Positive,1,3138


# Step 8:  to avoid overfitting, take equal size of samples from both classes

In [8]:
#  take 637 samples each of classes
df_pos = df[df.target =="Positive"].head(637)
df_neg = df[df.target =="Negative"].head(637)
df_equ = df_pos.append(df_neg)
len(df_equ)

1274

# Step 9:  shuffle the new dataset

In [9]:
df = shuffle(df_equ)
df.head()

Unnamed: 0,target,text
3542,Negative,"Saçma film,biraz tuhaf bir kovalamaca birazda ..."
582,Positive,Mükemmel ötesi bir film.Hele o tankeri patlatm...
1433,Negative,Ben hayatımda böyle bir film izlemedim. Şahane...
2017,Negative,Helen Mirren'in Oyunculuğu Kötü... Bir İletişi...
3390,Negative,oyuncular çok iyi ama bence bukadarda abartmay...


# Step 10:  drop stopwords, punctuations and lower the sentences

In [10]:
def drop_stopwords(raw_text):
    clean_data = [] 
    for text in raw_text:
        tokens = nltk.word_tokenize(text)
        tokens = [w.lower() for w in tokens]

        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]
        words = [word for word in stripped if word.isalpha()]


        stop_words = set(stops)
        words = [w for w in words if not w in stop_words]
        clean_data.append(" ".join([w for w in words]))
    
    return clean_data

In [11]:
## to avoid SettingWithCopyWarning, copy the original df as dfa and return the df
dfa = df.copy()
dfa["stopwords_dropped"] = drop_stopwords(dfa.text)
df = dfa.copy()
df.head()

Unnamed: 0,target,text,stopwords_dropped
3542,Negative,"Saçma film,biraz tuhaf bir kovalamaca birazda ...",saçma film tuhaf kovalamaca birazda değişik ka...
582,Positive,Mükemmel ötesi bir film.Hele o tankeri patlatm...,mükemmel ötesi filmhele tankeri patlatma sahne...
1433,Negative,Ben hayatımda böyle bir film izlemedim. Şahane...,hayatımda film izlemedim şahane derece gerçekl...
2017,Negative,Helen Mirren'in Oyunculuğu Kötü... Bir İletişi...,helen mirrenin oyunculuğu kötü fakültesi sinem...
3390,Negative,oyuncular çok iyi ama bence bukadarda abartmay...,oyuncular iyi bence bukadarda abartmayın aman ...


# Step 11:  normalize each sentence in the dataset

In [12]:
def normalizer(stopwords_dropped_text):
    normalized = []
    for sentence in stopwords_dropped_text:
        normalized.append(vu.get_normal(sentence))
    
    return normalized

In [13]:
df["normalized"] = normalizer(df.stopwords_dropped)
df.head()

Unnamed: 0,target,text,stopwords_dropped,normalized
3542,Negative,"Saçma film,biraz tuhaf bir kovalamaca birazda ...",saçma film tuhaf kovalamaca birazda değişik ka...,saçma film tuhaf kovalamaca birazda değişik ka...
582,Positive,Mükemmel ötesi bir film.Hele o tankeri patlatm...,mükemmel ötesi filmhele tankeri patlatma sahne...,mükemmel ötesi filmsele tankeri patlatma sahne...
1433,Negative,Ben hayatımda böyle bir film izlemedim. Şahane...,hayatımda film izlemedim şahane derece gerçekl...,hayatımda film izlemedim şahane derece gerçekl...
2017,Negative,Helen Mirren'in Oyunculuğu Kötü... Bir İletişi...,helen mirrenin oyunculuğu kötü fakültesi sinem...,helen midenin oyunculuğu kötü fakültesi sinema...
3390,Negative,oyuncular çok iyi ama bence bukadarda abartmay...,oyuncular iyi bence bukadarda abartmayın aman ...,oyuncular iyi bence Buka'larda abartmayın aman...


# Step 12:  stem each sentence tokens in the dataset

In [14]:
def stemmer(normalized_text):
    stemmed = []
    for sentence in normalized_text:
        stemmed.append(vu.get_stem(sentence))
    
    return stemmed

In [15]:
df["stemmed"] = stemmer(df.normalized)
df.head()

Expecting value: line 1 column 1 (char 0)


Unnamed: 0,target,text,stopwords_dropped,normalized,stemmed
3542,Negative,"Saçma film,biraz tuhaf bir kovalamaca birazda ...",saçma film tuhaf kovalamaca birazda değişik ka...,saçma film tuhaf kovalamaca birazda değişik ka...,saçm film tuhaf kovalama biraz değişik karakte...
582,Positive,Mükemmel ötesi bir film.Hele o tankeri patlatm...,mükemmel ötesi filmhele tankeri patlatma sahne...,mükemmel ötesi filmsele tankeri patlatma sahne...,mükemmel öte filmse tanker patla sahne gül kar...
1433,Negative,Ben hayatımda böyle bir film izlemedim. Şahane...,hayatımda film izlemedim şahane derece gerçekl...,hayatımda film izlemedim şahane derece gerçekl...,hayat film izle şahane derec gerçek uzak senar...
2017,Negative,Helen Mirren'in Oyunculuğu Kötü... Bir İletişi...,helen mirrenin oyunculuğu kötü fakültesi sinem...,helen midenin oyunculuğu kötü fakültesi sinema...,he mide oyun kötü fakülte sine bölüm mezun osc...
3390,Negative,oyuncular çok iyi ama bence bukadarda abartmay...,oyuncular iyi bence bukadarda abartmayın aman ...,oyuncular iyi bence Buka'larda abartmayın aman...,oyun iyi ben Buka lar abart aman aman bir film...


# Step 13:  drop unnecessary columns in the dataset

In [16]:
df = df.drop(['text', 'stopwords_dropped', 'normalized'], axis=1)
df.head()

Unnamed: 0,target,stemmed
3542,Negative,saçm film tuhaf kovalama biraz değişik karakte...
582,Positive,mükemmel öte filmse tanker patla sahne gül kar...
1433,Negative,hayat film izle şahane derec gerçek uzak senar...
2017,Negative,he mide oyun kötü fakülte sine bölüm mezun osc...
3390,Negative,oyun iyi ben Buka lar abart aman aman bir film...


# Step 14:  split train and test data

In [17]:
X_train, X_test, y_train, y_test = train_test_split(df.stemmed, df.target, test_size=0.20, random_state = 42)
## shape of the training data
X_train.shape

(1019,)

# Step 15:  vectorize the dataset using "TfidfVectorizer"

In [18]:
vectorizer = TfidfVectorizer(stop_words=stops, ngram_range=(1, 2))
vectorizer.fit(X_train)
#  get the trainig and test features by transforming vectorizer
training_features = vectorizer.transform(X_train)    
test_features = vectorizer.transform(X_test)
## shape of the vectorized training data
training_features.shape

(1019, 21496)

# Step 16:  apply "GridSearchCV" method for "MultinomialNB" classifier 

In [19]:
#  create an instance of "MultinomialNB" classifier 
mnb = MultinomialNB()
# give some tuned_parameters in order to find the best alpha hyperparameter
tuned_parameters = {
    'alpha': [1, 1e-1, 1e-2]
}
# create a scorer to compare the parameters
acc_scorer = make_scorer(accuracy_score)
# create an instance of "GridSearchCV" class and give parameters
grid_obj = GridSearchCV(mnb, tuned_parameters, cv=10, scoring=acc_scorer)
grid_obj = grid_obj.fit(training_features, y_train)
# set the model to the best combination of parameters
model = grid_obj.best_estimator_
# fit the best model for the dataset 
model.fit(training_features, y_train)



MultinomialNB(alpha=1, class_prior=None, fit_prior=True)

# Step 17:  predict the test data

In [20]:
y_pred = model.predict(test_features)
y_pred.shape

(255,)

# Step 18:  observe the accuracy score

In [21]:
print("Accuracy_score on the dataset:{:.2f}".format(accuracy_score(y_test, y_pred)))

Accuracy_score on the dataset:0.82


# Step 19:  create a classification_report

In [22]:
target_names = df.target.unique()
print(classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

    Negative       0.81      0.84      0.83       132
    Positive       0.82      0.79      0.80       123

   micro avg       0.82      0.82      0.82       255
   macro avg       0.82      0.81      0.82       255
weighted avg       0.82      0.82      0.82       255



# Step 20:  observe the confusion matrix

In [23]:
confusion_matrix(y_test, y_pred)

array([[111,  21],
       [ 26,  97]], dtype=int64)

# Step 21:  observe the prediction probabilities of the wrong predictions

In [24]:
pred_prob = model.predict_proba(test_features)
data = {'neg_ratio': pred_prob[:,0], 'pos_ratio': pred_prob[:,1],'pred': y_pred, 'real': y_test,'stemmed': X_test}
df_pred_prob = pd.DataFrame(data=data)
df_pred_prob = df_pred_prob[df_pred_prob.pred != df_pred_prob.real]
df_pred_prob.head()

Unnamed: 0,neg_ratio,pos_ratio,pred,real,stemmed
4015,0.496332,0.503668,Positive,Negative,biçi bir fil ya adam sade izle dış film bir yo...
989,0.392982,0.607018,Positive,Negative,karanlık film sev sil hoşlan film gerek kasvet...
212,0.529298,0.470702,Negative,Positive,salon sade kere gül se gel diy arkadaş fark fi...
172,0.452754,0.547246,Positive,Negative,garip fil ol katil kon cinayet roman eda karma...
687,0.342994,0.657006,Positive,Negative,kon güzel film benze başla film anlatım berbat...


# Step 22:  apply ten-fold cross validation

In [25]:
#  convert targets to numbers since cross_validate works with numbers
df["target_binary"] = df.target.replace("Positive",1).replace("Negative",0)
scoring_list = ["f1_macro","precision_macro","recall_macro","accuracy"]
scores = cross_validate(model,vectorizer.transform(df.stemmed), df.target_binary, cv=10, scoring=scoring_list)
for scr in scoring_list:
    print(scr+":"+"{:.2f}".format(scores["test_"+scr].mean()))

f1_macro:0.83
precision_macro:0.83
recall_macro:0.83
accuracy:0.83


# Step 23:  create pipeline for the model

In [26]:
model = Pipeline([
    ('vect', vectorizer),
    ('clf', model),
])

# Step 24:  pickle the model

In [None]:
joblib.dump(model,'Turkish_Sentiment_Analysis_With_Multinomial_Naive_Bayes.pkl')