## Building a machine learning text classifier

In [None]:
!pip install pandas seaborn nltk scikit-learn==1.0.2 transformers tensorflow emoji torch

In [None]:
import numpy as np
import pandas as pd

import nltk
import nltk.corpus
import sklearn
import sklearn.pipeline
import sklearn.feature_extraction.text
import sklearn.naive_bayes
import sklearn.model_selection
import sklearn.metrics
import tensorflow
import transformers

## Loading and exploring data

In [None]:
# Read the spreadsheet into a DataFrame; later cells use its 'text' and 'label' columns.
data = pd.read_excel("comm106e_happysad.xlsx")

In [None]:
data.head()

In [None]:
data['label'].value_counts()

In [None]:
# Lowercase every label so values that differ only by capitalization count as one class.
data['label'] = data['label'].str.lower()

In [None]:
data['label'].value_counts()

## Training the model pipeline

In [None]:
# The three stages, applied in order:
# raw text -> token counts -> TF-IDF weights -> Naive Bayes classifier.
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer()
tfidf_weighting = sklearn.feature_extraction.text.TfidfTransformer()
naive_bayes = sklearn.naive_bayes.MultinomialNB()

pipeline = sklearn.pipeline.Pipeline(steps=[
    ('vect', count_vectorizer),
    ('tfidf', tfidf_weighting),
    ('clf', naive_bayes),
])


In [None]:
# Fit every pipeline stage on the full data set (texts as input, labels as target).
pipeline.fit(data['text'], data['label'])

## Scoring some text

In [None]:
# A few hand-written texts to run through the fitted pipeline.
emails_to_score = [
    "I need this report by 9am or else I'll be mad!",
    "I love it, I'm so happy!",
    "our next conference should be in France",
    "our next conference should be in Germany",
]

# One row per text, one probability column per class
# (columns follow the order of pipeline.classes_).
prediction = pipeline.predict_proba(emails_to_score)

In [None]:
prediction

# Re-training with only 80% of the texts, leaving the other 20% for testing

### Split the data into training and testing sets


In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    data['text'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

### Train the model by fitting the pipeline to the 80% training data

In [None]:
# Re-fit the same pipeline object on the training split only, so the test
# split stays unseen by the model.
pipeline.fit(X_train, y_train)

### Use the pipeline to predict the labels of the 20% testing data that was not used to train it


In [None]:
# Predict a label for each held-out test text.
y_pred = pipeline.predict(X_test)

In [None]:
y_pred

### Score the accuracy of the model with the results of the predictions made from the 20% testing data

In [None]:
# Score the pipeline on the held-out split; for a classifier this is the mean accuracy.
pipeline.score(X_test, y_test)

### Print a classification report for how the 20% test data did, showing various statistics

In [None]:
# Per-label precision, recall, F1 and support, comparing the true test labels
# with the predicted ones.
print(sklearn.metrics.classification_report(y_test, y_pred))

# Some definitions

For a visual summary of the terms below, see the chart at https://en.wikipedia.org/wiki/Precision_and_recall#Definition_(classification_context)

## Accuracy:

$\frac{\mbox{number of correct predictions}}{\mbox{total number of items predicted}}$

## Precision or positive predictive value (PPV): 

$\frac{\mbox{number of true positives for a label}}{\mbox{number of true positives + false positives for that label}}$

## True positive rate (TPR) or recall, sensitivity, hit rate:

$\frac{\mbox{number of true positives for a label}}{\mbox{number of actual positives for that label}}$

## F1 score (harmonic mean of precision/PPV and recall/TPR): 

$2 \times \frac{\mbox{precision} \times \mbox{recall}}{\mbox{precision} + \mbox{recall}}$

## Support: Number of cases in each label or all labels

# Running our classification report

In [None]:
# Print the per-label report (precision, recall, F1, support) for the test split.
print(sklearn.metrics.classification_report(y_test, y_pred))

In [None]:
# Precision for the 'happy' label: of the texts predicted 'happy',
# the fraction whose true label is 'happy'.
sklearn.metrics.precision_score(
    y_true=y_test,
    y_pred=y_pred,
    pos_label='happy',
)

# Auditing the model for variations in locations

In [None]:
# Load the table of countries; the audit loop below reads its 'country' column.
countries = pd.read_csv("countries.csv")

In [None]:
countries

In [None]:
# Look up which predict_proba column belongs to the 'happy' class instead of
# assuming it is column 0. Columns follow the order of pipeline.classes_, so a
# hard-coded index silently reports the wrong class if the label set changes.
# Raises ValueError if the model was not trained with a 'happy' label.
happy_index = list(pipeline.classes_).index('happy')

results_list = []
for country_name in countries['country']:

    # Same sentence for every country, so only the place name varies.
    sample_text = "It is so nice in " + country_name + ", I love it there!"

    # predict_proba returns one row per input text; take the single row,
    # then the probability for the 'happy' class.
    probability = pipeline.predict_proba([sample_text])[0][happy_index]

    result = {'country': country_name,
              'happy_prediction': probability}

    results_list.append(result)
    
    

In [None]:
# Turn the list of per-country dicts into a DataFrame, then display it.
country_audit_data = pd.DataFrame(results_list)
country_audit_data

In [None]:
# Show the audit table ordered from the lowest to the highest 'happy' probability.
country_audit_data.sort_values(by='happy_prediction', ascending=True)