In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

#### Load checkpoint

In [23]:
df= pd.read_csv('tfidf_df.csv')

#### Import data

In [24]:
df_1 = pd.read_csv('x_city_df.csv')
df_0 = pd.read_csv('x_city_0_df.csv')
city_names_df = pd.read_csv('city_variations.csv')

In [25]:
cities = set(list(city_names_df['eng'])+list(city_names_df['wiki'])+list(city_names_df['local']))
cities = cities.union(set([word.lower() for word in cities]))
counter = 0

total = len(df_1)
results = {}
for i, comment in enumerate(df_1['comment']):    
    for j, city in enumerate(cities):
        if city in comment:
            counter +=1
    results[i] = counter
    counter = 0
            

In [26]:
key_list = list(results.keys())
val_list = list(results.values())
scratch_df  = pd.DataFrame()
scratch_df['key'] = key_list
scratch_df['val'] = val_list

There is at least one city name in every comment from the untranslated x_city_1 data, but 600+ comments in the english version return no city name occurrance. We'll run the TF-IDF classifier on the comments first and then on the English and try to find a way to deal with the translation issue

In [None]:
#TO DO: check if comments have been lemmatized. I think they came from unflattened unlemmatized folder

#### Combine 0 and 1 responses

In [79]:
y_0 = [0 for i in range(len(df_0))]
y_1 = [1 for i in range(len(df_1))]
df_0['Y'] = y_0
df_1['Y'] = y_1
df = df_0.append(df_1)
cols = ['comment','language','english','Y']
df = df[cols]

  df = df_0.append(df_1)


#### Tokenization of 'english'

In [28]:
import string

In [80]:
def tokenize(stem):

    stem = stem.replace('.',' ')

    data_into_list = stem.split(' ')
    data_into_list = [value for value in data_into_list if value != '']
    words = []
    for word in data_into_list:
    
        #remove numbers
        result = ''.join([j for j in word if not j.isdigit()])
        #remove rest of punctuation
        new_string = result.translate(str.maketrans('', '', string.punctuation))
        if 'begindocument' in new_string:
            new_string = new_string.replace('begindocument', '')
        if new_string != '':
            words.append(new_string)
            
    return words


In [81]:
tokenized_english = []
for i in df['english']:
    tokenized = tokenize(i)
    tokenized_english.append(tokenized)
    
df['tokenized_english'] = tokenized_english

#### Stop-word removal

In [82]:
def remove_stopwords(word_tokens):
    
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    stop_words = set(stopwords.words('english'))
    
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]

    return filtered_sentence

In [83]:
no_stopwords = []
for i in df['tokenized_english']:
    no_stopwords.append(remove_stopwords(i))
df['eng_tokenized_nosw'] = no_stopwords

#### Stemming

In [86]:
from gensim.parsing.porter import PorterStemmer
porter_stemmer = PorterStemmer()
df['stemmed_english'] = [[porter_stemmer.stem(word) for word in tokens] for tokens in df['eng_tokenized_nosw']]

#### Convert list back to string (for tf-idf function)¶

In [46]:
def token_to_string(df, col):
    string = []
    for i in df[col]:
        comment = ''
        for word in i:
            comment = comment + word + ' '
        string.append(comment)

    return string

In [88]:
#save checkpoint
df.to_csv('tfidf_df.csv')

In [59]:
#load checkpoint
df = pd.read_csv('tfidf_df.csv')
token_cols = ['tokenized_english', 'comment', 'eng_tokenized_nosw', 'stemmed_english']
for i in token_cols:
    df[i] = df[i].apply(lambda x: x.replace('[', '').replace(']', '').replace('\'', '').split(', '))

#### Default feature and labels

In [81]:
samp_df = df[~df['eng_clean'].isna()].sample(frac=0.02, random_state=43)
Y = samp_df['Y']
#drop 'eng_clean' nan values that were 
corpus = [i for i in samp_df['eng_clean']]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
#X.todense() prints the vectorized tf-idf matrix

tfidf_data = [X.toarray(), Y.to_numpy()]

#train/test split
X_train, X_test, y_train, y_test = train_test_split(tfidf_data[0], tfidf_data[1], test_size=0.33, random_state=72)

#### Naive Bayes (eng_clean)

In [74]:
#fit model
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train, y_train)

In [43]:
y_pred = naive_bayes_classifier.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=['Positive', 'Negative']))
print("Confusion matrix:")
print(metrics.confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

    Positive       0.90      0.61      0.72       631
    Negative       0.71      0.93      0.81       647

    accuracy                           0.77      1278
   macro avg       0.80      0.77      0.76      1278
weighted avg       0.80      0.77      0.76      1278

Confusion matrix:
[[382 249]
 [ 43 604]]


#### Naive Bayes (comment)

In [63]:
df['comment_clean'] = token_to_string(df, 'comment')

In [64]:
samp_df = df[~df['comment_clean'].isna()].sample(frac=0.02, random_state=3)
Y = samp_df['Y']
#drop 'eng_clean' nan values that were 
corpus = [i for i in samp_df['comment_clean']]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
#X.todense() prints the vectorized tf-idf matrix

tfidf_data = [X.toarray(), Y.to_numpy()]

#train/test split
X_train, X_test, y_train, y_test = train_test_split(tfidf_data[0], tfidf_data[1], test_size=0.33, random_state=72)

#fit model
naive_bayes_classifier = MultinomialNB()
naive_bayes_classifier.fit(X_train, y_train)

In [65]:
y_pred = naive_bayes_classifier.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=['Positive', 'Negative']))
print("Confusion matrix:")
print(metrics.confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

    Positive       0.87      0.45      0.60       642
    Negative       0.63      0.93      0.75       638

    accuracy                           0.69      1280
   macro avg       0.75      0.69      0.67      1280
weighted avg       0.75      0.69      0.67      1280

Confusion matrix:
[[291 351]
 [ 44 594]]


#### Other classifiers

In [72]:
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

In [82]:
logreg_clf = LogisticRegression()
logreg_clf.fit(X_train, y_train)

In [83]:
y_pred = logreg_clf.predict(X_test)
print(metrics.classification_report(y_test, y_pred, target_names=['Positive', 'Negative']))
print("Confusion matrix:")
print(metrics.confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

    Positive       0.83      0.91      0.87       627
    Negative       0.91      0.82      0.86       651

    accuracy                           0.87      1278
   macro avg       0.87      0.87      0.87      1278
weighted avg       0.87      0.87      0.87      1278

Confusion matrix:
[[573  54]
 [114 537]]


#### k-fold cross-validation

In [None]:
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score
%matplotlib inline

In [None]:
# K-Fold Cross-Validation
def cross_validation(model, _X, _y, _cv=5):
      '''Function to perform 5 Folds Cross-Validation
       Parameters
       ----------
      model: Python Class, default=None
              This is the machine learning algorithm to be used for training.
      _X: array
           This is the matrix of features.
      _y: array
           This is the target variable.
      _cv: int, default=5
          Determines the number of folds for cross-validation.
       Returns
       -------
       The function returns a dictionary containing the metrics 'accuracy', 'precision',
       'recall', 'f1' for both training set and validation set.
      '''
      _scoring = ['accuracy', 'precision', 'recall', 'f1']
      results = cross_validate(estimator=model,
                               X=_X,
                               y=_y,
                               cv=_cv,
                               scoring=_scoring,
                               return_train_score=True)
      
      return {"Training Accuracy scores": results['train_accuracy'],
              "Mean Training Accuracy": results['train_accuracy'].mean()*100,
              "Training Precision scores": results['train_precision'],
              "Mean Training Precision": results['train_precision'].mean(),
              "Training Recall scores": results['train_recall'],
              "Mean Training Recall": results['train_recall'].mean(),
              "Training F1 scores": results['train_f1'],
              "Mean Training F1 Score": results['train_f1'].mean(),
              "Validation Accuracy scores": results['test_accuracy'],
              "Mean Validation Accuracy": results['test_accuracy'].mean()*100,
              "Validation Precision scores": results['test_precision'],
              "Mean Validation Precision": results['test_precision'].mean(),
              "Validation Recall scores": results['test_recall'],
              "Mean Validation Recall": results['test_recall'].mean(),
              "Validation F1 scores": results['test_f1'],
              "Mean Validation F1 Score": results['test_f1'].mean()
              }

In [None]:
# Grouped Bar Chart for both training and validation data
def plot_result(x_label, y_label, plot_title, train_data, val_data):
        '''Function to plot a grouped bar chart showing the training and validation
          results of the ML model in each fold after applying K-fold cross-validation.
         Parameters
         ----------
         x_label: str, 
            Name of the algorithm used for training e.g 'Decision Tree'
          
         y_label: str, 
            Name of metric being visualized e.g 'Accuracy'
         plot_title: str, 
            This is the title of the plot e.g 'Accuracy Plot'
         
         train_result: list, array
            This is the list containing either training precision, accuracy, or f1 score.
        
         val_result: list, array
            This is the list containing either validation precision, accuracy, or f1 score.
         Returns
         -------
         The function returns a Grouped Barchart showing the training and validation result
         in each fold.
        '''
        
        # Set size of plot
        plt.figure(figsize=(12,6))
        labels = ["1st Fold", "2nd Fold", "3rd Fold", "4th Fold", "5th Fold"]
        X_axis = np.arange(len(labels))
        ax = plt.gca()
        plt.ylim(0.40000, 1)
        plt.bar(X_axis-0.2, train_data, 0.4, color='blue', label='Training')
        plt.bar(X_axis+0.2, val_data, 0.4, color='red', label='Validation')
        plt.title(plot_title, fontsize=30)
        plt.xticks(X_axis, labels)
        plt.xlabel(x_label, fontsize=14)
        plt.ylabel(y_label, fontsize=14)
        plt.legend()
        plt.grid(True)
        plt.show()

In [None]:
NB_result = cross_validation(MultinomialNB(), X, Y)
NB_result

In [None]:
# Plot Accuracy Result
model_name = "NB"
plot_result(model_name,
            "Accuracy",
            "Accuracy scores in 5 Folds",
            NB_result["Training Accuracy scores"],
            NB_result["Validation Accuracy scores"])

#### Iterate through random states and validate results

In [None]:
for i in range(10):
    samp_df = df[~df['eng_clean'].isna()].sample(frac=0.10, random_state=i)
    Y = samp_df['Y']
    corpus = [i for i in samp_df['eng_clean']]
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(corpus)
    #train/test split
    X_train, X_test, y_train, y_test = train_test_split(np.asarray(X.todense()), Y, test_size=0.33, random_state=72)
    #fit model
    naive_bayes_classifier = MultinomialNB()
    naive_bayes_classifier.fit(X_train, y_train)
    y_pred = naive_bayes_classifier.predict(X_test)

    print(metrics.classification_report(y_test, y_pred, target_names=['Positive', 'Negative']))

    NB_result = cross_validation(MultinomialNB(), X, Y)
    print(NB_result)

#### Convert tfidf x into two-dimensions for visualization

In [None]:
# Import required libraries for performance metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate

# Define dictionary with performance metrics
scoring = {'accuracy':make_scorer(accuracy_score), 
           'precision':make_scorer(precision_score),
           'recall':make_scorer(recall_score), 
           'f1_score':make_scorer(f1_score)}

# Import required libraries for machine learning classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Instantiate the machine learning classifiers
log_model = LogisticRegression(max_iter=10000)
svc_model = LinearSVC(dual=False)
dtr_model = DecisionTreeClassifier()
rfc_model = RandomForestClassifier()
gnb_model = GaussianNB()

# Define the models evaluation function
def models_evaluation(X, y, folds):
    
    '''
    X : data set features
    y : data set target
    folds : number of cross-validation folds
    
    '''
    
    # Perform cross-validation to each machine learning classifier
    log = cross_validate(log_model, X, y, cv=folds, scoring=scoring)
    svc = cross_validate(svc_model, X, y, cv=folds, scoring=scoring)
    dtr = cross_validate(dtr_model, X, y, cv=folds, scoring=scoring)
    rfc = cross_validate(rfc_model, X, y, cv=folds, scoring=scoring)
    gnb = cross_validate(gnb_model, X, y, cv=folds, scoring=scoring)

    # Create a data frame with the models perfoamnce metrics scores
    models_scores_table = pd.DataFrame({'Logistic Regression':[log['test_accuracy'].mean(),
                                                               log['test_precision'].mean(),
                                                               log['test_recall'].mean(),
                                                               log['test_f1_score'].mean()],
                                       
                                      'Support Vector Classifier':[svc['test_accuracy'].mean(),
                                                                   svc['test_precision'].mean(),
                                                                   svc['test_recall'].mean(),
                                                                   svc['test_f1_score'].mean()],
                                       
                                      'Decision Tree':[dtr['test_accuracy'].mean(),
                                                       dtr['test_precision'].mean(),
                                                       dtr['test_recall'].mean(),
                                                       dtr['test_f1_score'].mean()],
                                       
                                      'Random Forest':[rfc['test_accuracy'].mean(),
                                                       rfc['test_precision'].mean(),
                                                       rfc['test_recall'].mean(),
                                                       rfc['test_f1_score'].mean()],
                                       
                                      'Gaussian Naive Bayes':[gnb['test_accuracy'].mean(),
                                                              gnb['test_precision'].mean(),
                                                              gnb['test_recall'].mean(),
                                                              gnb['test_f1_score'].mean()]},
                                      
                                      index=['Accuracy', 'Precision', 'Recall', 'F1 Score'])
    
    # Add 'Best Score' column
    models_scores_table['Best Score'] = models_scores_table.idxmax(axis=1)
    
    # Return models performance metrics scores data frame
    return(models_scores_table)
  
# Run models_evaluation function
models_evaluation(X.toarray(), Y, 5)

[array([[-0.09301522, -0.09411812],
        [-0.09629439, -0.09509063],
        [-0.09395881, -0.09234162],
        ...,
        [-0.09758273, -0.02738785],
        [-0.09606031,  0.00726774],
        [-0.09119552, -0.08364914]]),
 array([0, 0, 0, ..., 1, 0, 0], dtype=int64)]