### Importing libraries

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.externals import joblib

In [14]:
data = pd.read_csv("Data/reddit-top-flairs-cleaned.csv")

In [15]:
data.head()

Unnamed: 0,id,url,title,body,flair,dirty_title,dirty_body,num_words_title,num_words_body,num_unique_words_title,num_unique_words_body,num_chars_title,num_chars_body
0,1267x2,https://twitter.com/TheVijayMallya/status/2618...,dat tweet,,Business/Finance,Dat Tweet,.,2,0,2,0,9,0
1,12i7hb,http://www.kickstarter.com/projects/1077963256...,mom started kickstarter show early indian phot...,,Photography,So my mom started a Kickstarter for her show o...,.,9,0,9,0,64,0
2,13ehd8,https://www.reddit.com/r/india/comments/13ehd8...,please consider buying book india 2020 exodus ...,friend cowritten book s excerpt serious descri...,Science/Technology,"Please consider buying our book ""India 2020 --...",A friend and I have co-written this book:\nhtt...,11,77,10,74,71,539
3,17294b,https://www.reddit.com/r/india/comments/17294b...,us double standards,terrorists afghanistan attack usa usa goes mot...,AskIndia,Us double standards,Terrorists in Afghanistan attack USA. USA goes...,3,17,3,12,19,128
4,17x8uw,https://www.reddit.com/r/india/comments/17x8uw...,list india related sub reddits,since removal drop linking related sub reddits...,AskIndia,A list of India Related Sub Reddits,"Since the removal of the drop down we had, lin...",5,264,5,134,30,1855


### Dropping unwanted columns and shuffling the data.

In [16]:
data = data.drop(['dirty_title','dirty_body','num_words_body','num_unique_words_title'
           ,'num_unique_words_body','num_chars_title','num_words_title','num_chars_body','url','id'], axis=1)

data.sample(frac=1)

Unnamed: 0,title,body,flair
441,businessfinance sbi reduces term deposit rate...,,Business/Finance
1774,indiascoronaviruscurvehasspikedupcasescouldrea...,,Coronavirus
83,fast well know science technological side coun...,presume users aware surroundings field technol...,Science/Technology
419,way interact fellow upsc aspirants reddit,quora used goto place civil discussions questi...,Science/Technology
345,mahindra ibm develop blockchain solution suppl...,,Business/Finance
...,...,...,...
811,askindia tips new car driver,bought used car last week m practicing driving...,AskIndia
937,nonpolitical given ban porn sites india reddi...,use alt account instagram reddit get fast nonv...,Non-Political
956,type sports shoe running,recently started running currently using spark...,Sports
1064,’ india olympic medals elite athletes,’ curious question coming american paki whose ...,Sports


In [17]:
data["body"].fillna(".", inplace = True)
data.head()

Unnamed: 0,title,body,flair
0,dat tweet,.,Business/Finance
1,mom started kickstarter show early indian phot...,.,Photography
2,please consider buying book india 2020 exodus ...,friend cowritten book s excerpt serious descri...,Science/Technology
3,us double standards,terrorists afghanistan attack usa usa goes mot...,AskIndia
4,list india related sub reddits,since removal drop linking related sub reddits...,AskIndia


In [18]:
top_flairs = ["Politics", "Non-Political", "Coronavirus", "AskIndia", "Policy/Economy", 
              "Photography", "Business/Finance", "Sports", "Science/Technology"]

### The columns of "title" and "body" are combined as one feature as "input_features" and "cat" stores the output label. 

In [19]:
input_features = data["title"] + " "+ data["body"]
data = data.assign(input_features = input_features)

y = data.flair

In [20]:
data = data.drop(['title', 'body'],axis=1)
data.head()

Unnamed: 0,flair,input_features
0,Business/Finance,dat tweet .
1,Photography,mom started kickstarter show early indian phot...
2,Science/Technology,please consider buying book india 2020 exodus ...
3,AskIndia,us double standards terrorists afghanistan att...
4,AskIndia,list india related sub reddits since removal d...


### Rearranging the columns and writing the final dataset used for models.

In [22]:
cols = data.columns.tolist()
print(cols)
cols = cols[-1:] + cols[:-1]
print(cols)
data = data[cols]

data.to_csv('Data/final-data.csv', index=False)
data.head()

['flair', 'input_features']
['input_features', 'flair']


Unnamed: 0,input_features,flair
0,dat tweet .,Business/Finance
1,mom started kickstarter show early indian phot...,Photography
2,please consider buying book india 2020 exodus ...,Science/Technology
3,us double standards terrorists afghanistan att...,AskIndia
4,list india related sub reddits since removal d...,AskIndia


### Train test split is done.

In [21]:
x_train, x_test, y_train, y_test = train_test_split(input_features,y, test_size=0.3)

print("x_train dim:",x_train.shape, "\ty_train dim:", y_train.shape)
print("x_test dim:",x_test.shape, "\ty_test dim:", y_test.shape)

x_train dim: (1409,) 	y_train dim: (1409,)
x_test dim: (604,) 	y_test dim: (604,)


## Logistic Regression

It is a typical classifier and most pervasively used one too. It has great interpretability properties.
### Parameters:
1. C = inverse regularization parameter; C = 1/λ;  It’s a penalty term meant regulate against overfitting.
2. max_iter = The maximum number of passes over the training data, epochs.

In [8]:
# Logistic Regression
logistic = Pipeline([('cv', CountVectorizer()),('tfidf', TfidfTransformer()),('lr', LogisticRegression(C=1000, max_iter=1000))])
logistic.fit(x_train, y_train)

y_pred = logistic.predict(x_test)

accuracy = (accuracy_score(y_pred, y_test)*100)
print("Accuracy: %.2f" % accuracy +'%')
print(classification_report(y_test, y_pred,target_names=top_flairs))

Accuracy: 82.28%
                    precision    recall  f1-score   support

          Politics       0.81      0.85      0.83        74
     Non-Political       0.78      0.69      0.73        65
       Coronavirus       0.93      0.97      0.95        69
          AskIndia       0.92      0.82      0.87        60
    Policy/Economy       0.92      0.95      0.94        62
       Photography       0.68      0.73      0.70        66
  Business/Finance       0.73      0.80      0.76        70
            Sports       0.77      0.73      0.75        64
Science/Technology       0.90      0.85      0.88        74

          accuracy                           0.82       604
         macro avg       0.83      0.82      0.82       604
      weighted avg       0.83      0.82      0.82       604



### Inference from Logistic Regression:
1. The model has a total accuracy of **83%**, which is good.
2. The most performing flair category was **"Coronavirus"**.
3. The least performing flair categories was **"Non-Political"** 

## NAIVE BAYES CLASSIFIER
One of the most suitable variants for text is the multinomial variant.

In [9]:
# Naive Bayes
naive = Pipeline([('cv', CountVectorizer()),('tfidf', TfidfTransformer()),('nb', MultinomialNB())])
naive.fit(x_train, y_train)
y_pred = naive.predict(x_test)

accuracy = (accuracy_score(y_pred, y_test)*100)
print("Accuracy: %.2f" % accuracy +'%')
print(classification_report(y_test, y_pred,target_names=top_flairs))

Accuracy: 69.04%
                    precision    recall  f1-score   support

          Politics       0.73      0.51      0.60        74
     Non-Political       0.40      0.78      0.53        65
       Coronavirus       0.83      0.97      0.89        69
          AskIndia       0.89      0.57      0.69        60
    Policy/Economy       0.85      0.90      0.88        62
       Photography       0.69      0.50      0.58        66
  Business/Finance       0.72      0.59      0.65        70
            Sports       0.56      0.70      0.62        64
Science/Technology       0.98      0.70      0.82        74

          accuracy                           0.69       604
         macro avg       0.74      0.69      0.70       604
      weighted avg       0.74      0.69      0.70       604



### Inference from Naive Bayes Classifier:
1. The model has a total accuracy of **69%**, which is decent.
2. The most performing flair category was **"Coronavirus"**.
3. The least performing flair category was **"Non-Political"**.

## Random Forest Classifier

Random forest classifier suits most multi-class classification problem, also they have good interpretability and work faster.
### Parameters:
1. n_estimators = the number of trees in the forest.

In [18]:
# Random Forest
random = Pipeline([('cv', CountVectorizer()),('tfidf', TfidfTransformer()),('rf', RandomForestClassifier(n_estimators = 600))])
random.fit(x_train, y_train)

y_pred = random.predict(x_test)

accuracy = (accuracy_score(y_pred, y_test)*100)
print("Accuracy: %.2f" % accuracy +'%')
print(classification_report(y_test, y_pred,target_names=top_flairs))

Accuracy: 82.28%
                    precision    recall  f1-score   support

          Politics       0.86      0.91      0.88        74
     Non-Political       0.82      0.72      0.77        65
       Coronavirus       0.85      0.97      0.91        69
          AskIndia       0.98      0.97      0.97        60
    Policy/Economy       0.95      0.95      0.95        62
       Photography       0.76      0.68      0.72        66
  Business/Finance       0.53      0.74      0.62        70
            Sports       0.84      0.72      0.77        64
Science/Technology       1.00      0.76      0.86        74

          accuracy                           0.82       604
         macro avg       0.84      0.82      0.83       604
      weighted avg       0.84      0.82      0.83       604



### Inference from Random Forest Classifier:
1. The model has a total accuracy of **82%**, which is goood.
2. The most performing flair category was **"AskIndia"**.
3. The least performing flair category was **"Business/Finance"**.

In [19]:
joblib.dump(random, 'Pickle_files/random-forest.pkl')

['Pickle_files/random-forest.pkl']

## k- Nearest Neighbours Classifier
### Parameters:
1. n_neighbours = number of neighbours to take into consideration.

In [34]:
# k-Nearest Neighbours
neighbours = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('knn', KNeighborsClassifier(n_neighbors=10))])
neighbours.fit(x_train, y_train)
y_pred = neighbours.predict(x_test)

accuracy = (accuracy_score(y_pred, y_test)*100)
print("Accuracy: %.2f" % accuracy +'%')
print(classification_report(y_test, y_pred,target_names=top_flairs))

Accuracy: 62.58%
                    precision    recall  f1-score   support

          Politics       0.70      0.43      0.53        74
     Non-Political       0.49      0.65      0.56        65
       Coronavirus       0.70      0.94      0.80        69
          AskIndia       0.68      0.63      0.66        60
    Policy/Economy       0.73      0.87      0.79        62
       Photography       0.44      0.53      0.48        66
  Business/Finance       0.59      0.41      0.49        70
            Sports       0.51      0.52      0.51        64
Science/Technology       0.89      0.68      0.77        74

          accuracy                           0.63       604
         macro avg       0.64      0.63      0.62       604
      weighted avg       0.64      0.63      0.62       604



### Inference from k- Nearest Neighbours Classifier:
1. The model has a total accuracy of **63%**, which is goood.
2. The most performing flair category was **"Coronavirus"**.
3. The least performing flair category was **"Photography"**.

## Linear Support Vector Machine
### Parameters:
1. loss = 'hinge' : The loss function to be used.‘hinge’ gives a linear SVM.
2. penalyty = regularisation term. 'l2' for linear SVM.
2. alpha = Constant that multiples the regularisation term.
3. max_iter = The maximum number of passes over the training data, epochs.

In [41]:
svm = Pipeline([('cv', CountVectorizer()),('tfidf', TfidfTransformer()),('svm', SGDClassifier(loss='hinge', 
                penalty='l2',alpha=0.001, max_iter=20))])
svm.fit(x_train, y_train)
y_pred = svm.predict(x_test)

accuracy = (accuracy_score(y_pred, y_test)*100)
print("Accuracy: %.2f" % accuracy +'%')
print(classification_report(y_test, y_pred,target_names=top_flairs))

Accuracy: 83.77%
                    precision    recall  f1-score   support

          Politics       0.86      0.91      0.88        74
     Non-Political       0.77      0.71      0.74        65
       Coronavirus       0.83      0.97      0.89        69
          AskIndia       0.93      0.90      0.92        60
    Policy/Economy       0.91      0.98      0.95        62
       Photography       0.74      0.73      0.73        66
  Business/Finance       0.77      0.73      0.75        70
            Sports       0.80      0.77      0.78        64
Science/Technology       0.93      0.85      0.89        74

          accuracy                           0.84       604
         macro avg       0.84      0.84      0.84       604
      weighted avg       0.84      0.84      0.84       604



### Inference from Linear SVM Classifier:
1. The model has a total accuracy of **84%**, which is gooood.
2. The most performing flair category was **"Policy/Economy"**.
3. The least performing flair category was **"Photography"**.

In [42]:
joblib.dump(random, 'Pickle_files/linear-svm.pkl')

['Pickle_files/linear-svm.pkl']

# OUT OF THE 5 MODELS USED, LINEAR SVM PERFORMED THE BEST WITH A TOTAL AN ACCURACY OF 84%