#### TRAINING DIFFERENT ML MODELS
<b> Models chosen :</b>
<ul>
    <li>Multinomial Naive Bayes Classifier
    <li>Random Forest Classifier
    <li>Support Vector Classifier
    <li>MLP Classifier
</ul>

In [115]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.pipeline import Pipeline
import pickle

#### LOADING PREPROCESSED DATASET
<b>DATA : </b>
<ul>
    <li>Flair
    <li>Post title
</ul>

In [2]:
#### LOADING OUR PREPROCESSED DATASET ####
data = pd.read_csv('datasets/process_reddit_data_stemmed.csv')
labels = data['Flair']
features = data['Post']

In [3]:
data.head()

Unnamed: 0,Flair,Post
0,Politics,pit commun polit parti fuck stupid.
1,Politics,new polit parti gave full front page ad popula...
2,Politics,aap woeful respons delhi commun violenc reflec...
3,Politics,"soft hindutva, left revolution, kejriw establi..."
4,Politics,comic current polit scenario


#### THE FOLLOWING CODE CELL CONVERTS LABELS TO NUMERIC VALUES
<b>Returns : </b> 
    <ul>
        <li>transformed labels
        <li>label_2_id dictionary

In [4]:
#### TRANSFORMING LABELS TO NUMBERS #####
def label_transformed(labels):
    label_dict = dict()
    count = 0
    for i in labels:
        if(i not in label_dict.keys()):
            label_dict[i] =count
            count+=1

    transformed_labels = [label_dict[i] for i in labels]

    return transformed_labels, label_dict

In [5]:
label_tf, label_dict = label_transformed(labels)

In [6]:
print(label_dict)

{'Politics': 0, 'Coronavirus': 1, 'AskIndia': 2, 'Non-Political': 3, 'Policy/Economy': 4, 'Scheduled': 5, 'Business/Finance': 6, 'Science/Technology': 7, 'Food': 8, 'Photography': 9}


#### SPLITTING DATASET INTO TRAIN AND TEST SETS

In [99]:
#### SPLITTING DATASET #####
x_train, x_test, y_train, y_test = train_test_split(features, label_tf, test_size= 0.18, shuffle= True)

In [26]:
print(' training data has {} samples'.format(len(x_train)))
print(' validation data has {} samples'.format(len(x_test)))

 training data has 1170 samples
 validation data has 257 samples


#### A SCORING FUNCTION 
<b>Input : </b>
<ul>
    <li> test labels
    <li> predicted labels
</ul>

<b>Prints : </b>
<ul>
    <li>f1_score : <ol><li>It is really good metric while performing multiclass classification.
                   weighted average of precision and recall. <li>macro averaging was used to consider each class into account.</ol></li>
    <li>accuracy
    <li>precision
    <li>recall
</ul>

In [27]:
##### SCORING FUNCTION ####
def scoring(y_test, y_predict):
    acc = metrics.accuracy_score(y_test, y_predict)
    f1_score = metrics.f1_score(y_test, y_predict, average='macro')
    precision = metrics.precision_score(y_test,y_predict,average='macro')
    recall = metrics.recall_score(y_test,y_predict,average='macro')
    print('the classifier has\n f1_score: {}\n accuracy: {}\n precision:{}\n recall: {}'.format(f1_score,acc,precision,recall))

#### USES MULTINOMIAL NAIVE BAYES CLASSIFIER FOR PREDICTING LABELS

In [112]:
### MULTINOMIAL NAIVE BAYES #####
M_nb = Pipeline([('tfidf',TfidfVectorizer(lowercase=True, min_df=2, ngram_range=(1,2),stop_words='english')),
                 ('clf',MultinomialNB(alpha=0.5))])

M_nb.fit(x_train,y_train)
y_predicted_nb = M_nb.predict(x_test)
scoring(y_test,y_predicted_nb)
pkl_file = 'ML_models/M_nb_model.pkl'
with open(pkl_file,'wb') as f:
    pickle.dump(M_nb,f)

the classifier has
 f1_score: 0.6574231625858569
 accuracy: 0.6536964980544747
 precision:0.6758762046673813
 recall: 0.66807696007696


#### USES RANDOM FOREST CLASSIFIER FOR PREDICTING LABELS

In [113]:
### RANDOM FOREST PIPELINE #####

rf = Pipeline([('tfidf',TfidfVectorizer(lowercase=True, min_df=2, ngram_range=(1,2),stop_words='english')),
                 ('clf',RandomForestClassifier(n_estimators=200))])
rf.fit(x_train,y_train)
y_predicted_rf = rf.predict(x_test)
scoring(y_test,y_predicted_rf)
pkl_file = 'ML_models/rf_model.pkl'
with open(pkl_file,'wb') as f:
    pickle.dump(rf,f)

the classifier has
 f1_score: 0.6758861327728923
 accuracy: 0.688715953307393
 precision:0.6801246207027141
 recall: 0.6886589706589706


#### USES SUPPORT VECTOR CLASSIFIER FOR PREDICTING LABELS

In [114]:
#### SVM PIPELINE #####

svc = Pipeline([('tfidf',TfidfVectorizer(lowercase=True, min_df=2, ngram_range=(1,2),stop_words='english')),
                 ('clf',SVC())])
svc.fit(x_train,y_train)
y_predicted_svc = svc.predict(x_test)
scoring(y_test,y_predicted_svc)
pkl_file = 'ML_models/svc_model.pkl'
with open(pkl_file,'wb') as f:
    pickle.dump(svc,f)

the classifier has
 f1_score: 0.6876937240129652
 accuracy: 0.6848249027237354
 precision:0.6965751988483386
 recall: 0.687990860990861


#### USES MULTI LAYER PERCEPTRON CLASSIFIER FOR PREDICTING LABELS

In [160]:
##### MLP CLASSIFIER #####
mlp = Pipeline([('tfidf',TfidfVectorizer(lowercase=True, min_df=2, ngram_range=(1,2),stop_words='english')),('mlp',MLPClassifier(hidden_layer_sizes=(64,32),alpha=0.1,random_state=0,max_iter=500))])
mlp.fit(x_train,y_train)
y_predicted_mlp = mlp.predict(x_test)
scoring(y_test,y_predicted_mlp)
pkl_file = 'ML_models/mlp_model.pkl'
with open(pkl_file,'wb') as f:
    pickle.dump(mlp,f)

the classifier has
 f1_score: 0.6561505507796792
 accuracy: 0.6536964980544747
 precision:0.6624703925350843
 recall: 0.6565526695526696
