# Predicting the reddit post flair

Source: https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f

Importing necessary libraries

In [101]:
# Data Manipulation 
import numpy as np 
import pandas as pd

# Data Visualisation 
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical libraries 
from sklearn.feature_selection import chi2

# Natural Language Processing
import nltk 
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# Machine Learning 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Performance Evaluation and Support
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import metrics


In [2]:
# Load the data into the dataframe 
data = pd.read_csv('data.csv')
data.shape

(1650, 11)

In [3]:
# Initial look at the data
data.head()

Unnamed: 0.1,Unnamed: 0,Title,Score,ID,URL,num_comments,created_on,Body,Original,Flair,Comments
0,0,"Lost my Job, Sick Mother and Paralysed Dad, In...",1046,g014wc,https://www.reddit.com/r/india/comments/g014wc...,134,1586742000.0,Hi....It's really tough time for everyone. I r...,False,AskIndia,"Learn Python, then Django. Php might take a lo..."
1,1,Why does the government come with a begging bo...,645,fxofyu,https://www.reddit.com/r/india/comments/fxofyu...,204,1586448000.0,"We have floods, terrorist attacks, famines due...",False,AskIndia,[removed]
2,2,Mother's condition is going worse due to hepat...,764,g0zlly,https://www.reddit.com/r/india/comments/g0zlly...,94,1586871000.0,"Hi folks, I really appreciate the warm respons...",False,AskIndia,Can I get some updates and verification on thi...
3,3,People stuck with their family during the lock...,157,g4lrhm,https://www.reddit.com/r/india/comments/g4lrhm...,117,1587384000.0,I don't think we've spend so much time with fa...,False,AskIndia,>patriarchal father who could care less etc \...
4,4,Men who are 30+ and have decided not to get ma...,267,fvy95j,https://www.reddit.com/r/india/comments/fvy95j...,206,1586207000.0,The corona virus has given me some time to thi...,False,AskIndia,Get married. Indians are genetically engineere...


In [4]:
# Printing the data info to have a look at the null values and data types
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1650 entries, 0 to 1649
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    1650 non-null   int64  
 1   Title         1650 non-null   object 
 2   Score         1650 non-null   int64  
 3   ID            1650 non-null   object 
 4   URL           1650 non-null   object 
 5   num_comments  1650 non-null   int64  
 6   created_on    1650 non-null   float64
 7   Body          635 non-null    object 
 8   Original      1650 non-null   bool   
 9   Flair         1650 non-null   object 
 10  Comments      1557 non-null   object 
dtypes: bool(1), float64(1), int64(3), object(6)
memory usage: 130.6+ KB


This dataset does not have any null values for the flairs. There are null values only in Comments and Body. 

In [5]:
# Printing the info will show 987 not null rows
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1650 entries, 0 to 1649
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    1650 non-null   int64  
 1   Title         1650 non-null   object 
 2   Score         1650 non-null   int64  
 3   ID            1650 non-null   object 
 4   URL           1650 non-null   object 
 5   num_comments  1650 non-null   int64  
 6   created_on    1650 non-null   float64
 7   Body          635 non-null    object 
 8   Original      1650 non-null   bool   
 9   Flair         1650 non-null   object 
 10  Comments      1557 non-null   object 
dtypes: bool(1), float64(1), int64(3), object(6)
memory usage: 130.6+ KB


In [6]:
# Making a copy of the data for later use
data_og = data.copy()        

In [7]:
# List of relevant features (MOVE LATER)
features = ['Flair', 'URL', 'Title', 'Comments', 'Body']

Our first task is creating a set of labels and assigning them numbers for each unique flair. So, we can first extract the flair data and then assign them integer numbers. This will also include alloting repetitive and similar flairs same labels. We will be adding a new column `id` which is not equivalent to the original ID, hence I am calling it id.  

In [8]:
# Collecting the flair and ids
data = data[features]
data

Unnamed: 0,Flair,URL,Title,Comments,Body
0,AskIndia,https://www.reddit.com/r/india/comments/g014wc...,"Lost my Job, Sick Mother and Paralysed Dad, In...","Learn Python, then Django. Php might take a lo...",Hi....It's really tough time for everyone. I r...
1,AskIndia,https://www.reddit.com/r/india/comments/fxofyu...,Why does the government come with a begging bo...,[removed],"We have floods, terrorist attacks, famines due..."
2,AskIndia,https://www.reddit.com/r/india/comments/g0zlly...,Mother's condition is going worse due to hepat...,Can I get some updates and verification on thi...,"Hi folks, I really appreciate the warm respons..."
3,AskIndia,https://www.reddit.com/r/india/comments/g4lrhm...,People stuck with their family during the lock...,>patriarchal father who could care less etc \...,I don't think we've spend so much time with fa...
4,AskIndia,https://www.reddit.com/r/india/comments/fvy95j...,Men who are 30+ and have decided not to get ma...,Get married. Indians are genetically engineere...,The corona virus has given me some time to thi...
...,...,...,...,...,...
1645,AMA,https://www.reddit.com/r/india/comments/2oytx7...,IAMA person suffering from Bipola[r] Disorder....,Is your zodiac sign Gemini?,Hi all. I alternate between feeling like Einst...
1646,AMA,https://www.reddit.com/r/india/comments/4bmka5...,"Identity, policy, and privacy",[deleted],Hi Reddit community! It’s a pleasure to be her...
1647,AMA,https://www.reddit.com/r/india/comments/3rhufx...,"Hi /r/India, I am cartoonist Sumit Kumar autho...",late to the party and i have no questions for ...,Edit : Going to sleep now. Big dhanyawaad for ...
1648,AMA,https://www.reddit.com/r/india/comments/4yc03a...,"Hi Reddit, this is XUlrike from Janwaar Castle...",[Your organization’s logo](https://janwaar-cas...,The purpose of the Janwaar Castle Community Or...


In [9]:
# Assigning and individual id to each flair
data['id'] = data['Flair'].factorize()[0]
flair_category = data[['Flair', 'id']].drop_duplicates().sort_values('id')
flair_category

Unnamed: 0,Flair,id
0,AskIndia,0
150,Non-Political,1
300,[R]eddiquette,2
450,Photography,3
600,Science/Technology,4
750,Politics,5
900,Business/Finance,6
1050,Policy/Economy,7
1200,Sports,8
1350,Food,9


First, we need to print individual strings to make sure that we are comparing the strings accurately. This means that some strings may have a space here and there and we need to get rid of that or take that into account. I could have used str.contains as well but I prefer this method for the sake of accuracy. 

In [10]:
# Convert into a label dctionary to be used as a means of assigning labels after the prediction
category_labels = dict(flair_category.values)
print(category_labels)

{'AskIndia': 0, 'Non-Political': 1, '[R]eddiquette': 2, 'Photography': 3, 'Science/Technology': 4, 'Politics': 5, 'Business/Finance': 6, 'Policy/Economy': 7, 'Sports': 8, 'Food': 9, 'AMA': 10}


In [11]:
# Similarly, we can create an inverse of the previouus one to convert labels to categories
category_reverse = dict(flair_category[['id', 'Flair']].values)
print(category_reverse)

{0: 'AskIndia', 1: 'Non-Political', 2: '[R]eddiquette', 3: 'Photography', 4: 'Science/Technology', 5: 'Politics', 6: 'Business/Finance', 7: 'Policy/Economy', 8: 'Sports', 9: 'Food', 10: 'AMA'}


Have a look at the data now. We have an id column which are basically the labels that we have to predict. They are derived from equivalent flair categories. We will be using the other columns as our input features. We will also create a series of all labels that need to predicted. 

In [12]:
labels = data['id']
data.head(10)

Unnamed: 0,Flair,URL,Title,Comments,Body,id
0,AskIndia,https://www.reddit.com/r/india/comments/g014wc...,"Lost my Job, Sick Mother and Paralysed Dad, In...","Learn Python, then Django. Php might take a lo...",Hi....It's really tough time for everyone. I r...,0
1,AskIndia,https://www.reddit.com/r/india/comments/fxofyu...,Why does the government come with a begging bo...,[removed],"We have floods, terrorist attacks, famines due...",0
2,AskIndia,https://www.reddit.com/r/india/comments/g0zlly...,Mother's condition is going worse due to hepat...,Can I get some updates and verification on thi...,"Hi folks, I really appreciate the warm respons...",0
3,AskIndia,https://www.reddit.com/r/india/comments/g4lrhm...,People stuck with their family during the lock...,>patriarchal father who could care less etc \...,I don't think we've spend so much time with fa...,0
4,AskIndia,https://www.reddit.com/r/india/comments/fvy95j...,Men who are 30+ and have decided not to get ma...,Get married. Indians are genetically engineere...,The corona virus has given me some time to thi...,0
5,AskIndia,https://www.reddit.com/r/india/comments/g1lmhg...,[Please Advice] Reality punched me in the face...,[deleted],"Sorry Reddit, this post is going to be long. P...",0
6,AskIndia,https://www.reddit.com/r/india/comments/g42vfo...,What is the PM CARES fund being used for? how ...,Someone had created a petition for this on cha...,"In this need of the hour, many citizens are co...",0
7,AskIndia,https://www.reddit.com/r/india/comments/g0igt7...,"r/India: If money is no bar, would you prefer ...",This kind of questions are always asked and an...,Seems like everybody here is very critical of ...,0
8,AskIndia,https://www.reddit.com/r/india/comments/g590ut...,"So, I'm an American dating a South Indian, and...",[removed],"So, I'm dating a grad student from Coimbature....",0
9,AskIndia,https://www.reddit.com/r/india/comments/g2xjhd...,Any idea what has happened to quora,It's quite ironic this post comming from r/india,Hey guys. I used to read a lot on quora before...,0


## Performing text analysis

In [13]:
# Import nltk stopwords as done in the previous notebook as well
STOPWORDS = nltk.corpus.stopwords.words('english')
print(STOPWORDS)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [14]:
# Import nltk punctuation which will be removed from our texts as well
# nltk.download('punkt')
# PUNCT = nltk.corpus.stopwords.words('punkt')
# print(PUNCT)

## Working Dataset

For this, I will be combining the data present in the body, Title and the Comments. 

This is giving me a very weird problem. 
All comparison with nan are returning to be false. np.NaN is not working so I am trying a different approach. I am comparing the value with float or str to determine the existence of null value. Float = Null. str = something is present. 

In [15]:
# for i in range(len(data)):
#     print(type(data.iloc[i]['Body']))

In [16]:
data['Combine'] = data['Title'] # Create a column combined
count = 0
for i in range(len(data)):
    if type(data.loc[i]['Body']) != float:
        data['Combine'][i] = data['Combine'][i] + ' ' + data['Body'][i]

    if type(data.loc[i]['Comments']) != float:
        data['Combine'][i] = data['Combine'][i] + ' ' + data['Comments'][i]

data.head(20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Flair,URL,Title,Comments,Body,id,Combine
0,AskIndia,https://www.reddit.com/r/india/comments/g014wc...,"Lost my Job, Sick Mother and Paralysed Dad, In...","Learn Python, then Django. Php might take a lo...",Hi....It's really tough time for everyone. I r...,0,"Lost my Job, Sick Mother and Paralysed Dad, In..."
1,AskIndia,https://www.reddit.com/r/india/comments/fxofyu...,Why does the government come with a begging bo...,[removed],"We have floods, terrorist attacks, famines due...",0,Why does the government come with a begging bo...
2,AskIndia,https://www.reddit.com/r/india/comments/g0zlly...,Mother's condition is going worse due to hepat...,Can I get some updates and verification on thi...,"Hi folks, I really appreciate the warm respons...",0,Mother's condition is going worse due to hepat...
3,AskIndia,https://www.reddit.com/r/india/comments/g4lrhm...,People stuck with their family during the lock...,>patriarchal father who could care less etc \...,I don't think we've spend so much time with fa...,0,People stuck with their family during the lock...
4,AskIndia,https://www.reddit.com/r/india/comments/fvy95j...,Men who are 30+ and have decided not to get ma...,Get married. Indians are genetically engineere...,The corona virus has given me some time to thi...,0,Men who are 30+ and have decided not to get ma...
5,AskIndia,https://www.reddit.com/r/india/comments/g1lmhg...,[Please Advice] Reality punched me in the face...,[deleted],"Sorry Reddit, this post is going to be long. P...",0,[Please Advice] Reality punched me in the face...
6,AskIndia,https://www.reddit.com/r/india/comments/g42vfo...,What is the PM CARES fund being used for? how ...,Someone had created a petition for this on cha...,"In this need of the hour, many citizens are co...",0,What is the PM CARES fund being used for? how ...
7,AskIndia,https://www.reddit.com/r/india/comments/g0igt7...,"r/India: If money is no bar, would you prefer ...",This kind of questions are always asked and an...,Seems like everybody here is very critical of ...,0,"r/India: If money is no bar, would you prefer ..."
8,AskIndia,https://www.reddit.com/r/india/comments/g590ut...,"So, I'm an American dating a South Indian, and...",[removed],"So, I'm dating a grad student from Coimbature....",0,"So, I'm an American dating a South Indian, and..."
9,AskIndia,https://www.reddit.com/r/india/comments/g2xjhd...,Any idea what has happened to quora,It's quite ironic this post comming from r/india,Hey guys. I used to read a lot on quora before...,0,Any idea what has happened to quora Hey guys. ...


In [17]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1650 entries, 0 to 1649
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Flair     1650 non-null   object
 1   URL       1650 non-null   object
 2   Title     1650 non-null   object
 3   Comments  1557 non-null   object
 4   Body      635 non-null    object
 5   id        1650 non-null   int64 
 6   Combine   1650 non-null   object
dtypes: int64(1), object(6)
memory usage: 90.4+ KB


In [18]:
data.loc[34]['Combine']

"Solve my family dispute! In my country, the quarantine began nearly around 15 march and I've been home since then along with my brother, mother, father and grandmother. My father has been abusing all of us mentally since the quarantine began and Today he tried to beat the three of us (me, brother and mother) with a stick but we were successfully able to defend but got minor injuries and then the community people arrived. And somehow the dispute was solved ( as we all thought ) but since then he's really angry and I'm truly afraid of my life. He threatened that he'll kill us and all of my mother's family. Please guide me on how to get help and solve this! I'm unable to understand who to contact and how to go forward. My mother has agreed for a divorce but we'll apply for that once this gets over.\nPlease don't joke around as I'm genuinely afraid and I'm literally crying and shaking all the time.\n\nFollow up because a lot of people are talking about calling the police: We called the po

### Cleaning our data that will be used as an input

In [19]:
REPLACE_SPACES = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS = re.compile('[^0-9a-z #+_]')

There are certain symbols which add no analytical value to the data. Similarly, there certain areas where there are extra spaces or bracket spaces which are being being substituted by just one space.

In [20]:
def clean_text(text):
    '''
        text: a string
        
        return: modified initial string
        
    '''

    text = text.lower() # lowercase text
    text = REPLACE_SPACES.sub(' ', text) 
    text = BAD_SYMBOLS.sub('', text) # Replace Bad Symbols which 
    text = text.replace('x', '')
    
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

data['Combine'] = data['Combine'].apply(clean_text)
data['Combine'] = data['Combine'].str.replace('\d+', '')

In [21]:
data['Combine']

0       lost job sick mother paralysed dad lockdown ea...
1       government come begging bowl every crisis floo...
2       mothers condition going worse due hepatitis b ...
3       people stuck family lockdown family falling ap...
4       men + decided get married plan old age corona ...
                              ...                        
1645    iama person suffering bipola r disorder ama hi...
1646    identity policy privacy hi reddit community pl...
1647    hi r india cartoonist sumit kumar author amar ...
1648    hi reddit ulrike janwaar castle ask anything p...
1649    hey guys harsh rajat started entrepreneur jour...
Name: Combine, Length: 1650, dtype: object

### TFIDF Feature Importance

TFIDF =  Term Frequency–Inverse Document Frequency.
It is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus.

In [22]:
# Creating an instance of the Tfidf vectorizer
# I will be performing a hyperparameter tuning soon
tfidf = TfidfVectorizer(sublinear_tf=True, 
                        min_df=5, 
                        stop_words=STOPWORDS, 
                        norm = 'l2', 
                        encoding='latin-1', 
                        ngram_range=(1, 2))

In [23]:
data.tail()

Unnamed: 0,Flair,URL,Title,Comments,Body,id,Combine
1645,AMA,https://www.reddit.com/r/india/comments/2oytx7...,IAMA person suffering from Bipola[r] Disorder....,Is your zodiac sign Gemini?,Hi all. I alternate between feeling like Einst...,10,iama person suffering bipola r disorder ama hi...
1646,AMA,https://www.reddit.com/r/india/comments/4bmka5...,"Identity, policy, and privacy",[deleted],Hi Reddit community! It’s a pleasure to be her...,10,identity policy privacy hi reddit community pl...
1647,AMA,https://www.reddit.com/r/india/comments/3rhufx...,"Hi /r/India, I am cartoonist Sumit Kumar autho...",late to the party and i have no questions for ...,Edit : Going to sleep now. Big dhanyawaad for ...,10,hi r india cartoonist sumit kumar author amar ...
1648,AMA,https://www.reddit.com/r/india/comments/4yc03a...,"Hi Reddit, this is XUlrike from Janwaar Castle...",[Your organization’s logo](https://janwaar-cas...,The purpose of the Janwaar Castle Community Or...,10,hi reddit ulrike janwaar castle ask anything p...
1649,AMA,https://www.reddit.com/r/india/comments/4vs9fj...,"Hey guys, I am Harsh Rajat, started my entrepr...","Okay then! this seems weird, more than that sh...",About Me: A little backstory about me: I start...,10,hey guys harsh rajat started entrepreneur jour...


In [24]:
# Extracting the features by fitting the Vectorizer on our Title data because that has the description of the post
feat = tfidf.fit_transform(data['Combine']).toarray()
print(feat.shape)

(1650, 3297)


Now, I need to look at the most correlated words with each category and list them. I am gonna look at monograms.

In [25]:
# chisq2 statistical test
N = 5    # Number of examples to be listed
for f, i in sorted(category_labels.items()):
    chi2_feat = chi2(feat, labels == i)
    indices = np.argsort(chi2_feat[0])
    feat_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [w for w in feat_names if len(w.split(' ')) == 1]
    print("\nFlair '{}':".format(f))
    print("Most correlated unigrams:\n\t. {}".format('\n\t. '.join(unigrams[-N:])))



Flair 'AMA':
Most correlated unigrams:
	. hi
	. anything
	. ask
	. questions
	. ama

Flair 'AskIndia':
Most correlated unigrams:
	. advice
	. dad
	. situation
	. afraid
	. family

Flair 'Business/Finance':
Most correlated unigrams:
	. firms
	. emi
	. hdfc
	. mukesh
	. bank

Flair 'Food':
Most correlated unigrams:
	. restaurant
	. chutney
	. recipe
	. chicken
	. food

Flair 'Non-Political':
Most correlated unigrams:
	. rural
	. dads
	. found
	. bored
	. comics

Flair 'Photography':
Most correlated unigrams:
	. mm
	. beach
	. nikon
	. shot
	. oc

Flair 'Policy/Economy':
Most correlated unigrams:
	. gdp
	. govt
	. investments
	. nirmala
	. economy

Flair 'Politics':
Most correlated unigrams:
	. sonia
	. removed
	. modi
	. arnab
	. muslims

Flair 'Science/Technology':
Most correlated unigrams:
	. vpn
	. iit
	. develop
	. zoom
	. users

Flair 'Sports':
Most correlated unigrams:
	. ipl
	. football
	. sports
	. cricket
	. cup

Flair '[R]eddiquette':
Most correlated unigrams:
	. creator
	. bo

### Model Input Preparation

In [26]:
flair_list = list(category_labels.keys())
flair_list

['AskIndia',
 'Non-Political',
 '[R]eddiquette',
 'Photography',
 'Science/Technology',
 'Politics',
 'Business/Finance',
 'Policy/Economy',
 'Sports',
 'Food',
 'AMA']

In [27]:
# Splitting 20% of the data into train test split
X_train, X_test, y_train, y_test = train_test_split(data['Combine'], data['Flair'], test_size=0.15, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(1402,) (1402,) (248,) (248,)


I randomized the training and testing data for better predictions. This is very important since the data has homogenous flairs for every 150 entries. 


## Building our classifiers

I will be building functions for different clasifiers. These functions will have a pipeline implemented for each model. This pipeline will first create an instance of the Count Vectorizer to create vectors of word counts and then it will also implement a TFID Transformer. 

**WRITE MORE HERE**

In [70]:
# Creating an instance of the TFID transformer
count_vec = CountVectorizer()
X_train_counts = count_vec.fit_transform(X_train)
print(X_train_counts)

  (0, 6503)	1
  (0, 4353)	1
  (0, 12864)	1
  (0, 5763)	1
  (0, 6553)	1
  (0, 3866)	1
  (0, 13979)	1
  (0, 673)	1
  (0, 12997)	1
  (1, 16669)	3
  (1, 7822)	9
  (1, 1259)	10
  (1, 9106)	11
  (1, 1220)	1
  (1, 322)	1
  (1, 13310)	6
  (1, 15414)	4
  (1, 13568)	16
  (1, 9227)	1
  (1, 14917)	3
  (1, 10981)	2
  (1, 1043)	1
  (1, 1843)	1
  (1, 11089)	1
  (1, 12052)	4
  :	:
  (1399, 6466)	1
  (1399, 12626)	1
  (1399, 14138)	1
  (1400, 7238)	2
  (1400, 14034)	1
  (1400, 15327)	1
  (1400, 5153)	1
  (1400, 2201)	1
  (1400, 6007)	2
  (1400, 13867)	1
  (1400, 1363)	1
  (1400, 570)	4
  (1400, 13215)	1
  (1400, 10563)	1
  (1400, 1443)	1
  (1400, 13709)	4
  (1400, 7880)	1
  (1401, 3799)	1
  (1401, 12679)	1
  (1401, 3316)	1
  (1401, 7300)	1
  (1401, 5849)	1
  (1401, 12471)	1
  (1401, 6067)	1
  (1401, 3773)	1


In [71]:
# Creating an instance of the TFID transformer
tfidf_trans = TfidfTransformer()
X_train_tfidf = tfidf_trans.fit_transform(X_train_counts)
print(X_train_tfidf)

  (0, 13979)	0.3013721880655323
  (0, 12997)	0.3051264927290089
  (0, 12864)	0.35123291271193524
  (0, 6553)	0.2537328263091896
  (0, 6503)	0.3732966348536559
  (0, 5763)	0.4110148169841684
  (0, 4353)	0.3889510948424478
  (0, 3866)	0.33557845272314335
  (0, 673)	0.2369563506043619
  (1, 16693)	0.013605063330183872
  (1, 16669)	0.03109231737890458
  (1, 16626)	0.021906120263269963
  (1, 16611)	0.019895830123547936
  (1, 16601)	0.021906120263269963
  (1, 16562)	0.019248661248696376
  (1, 16555)	0.08762448105307985
  (1, 16528)	0.01587524984410388
  (1, 16494)	0.008904356598326654
  (1, 16490)	0.01569942415674625
  (1, 16483)	0.017543941428934177
  (1, 16468)	0.010474604323949428
  (1, 16453)	0.012513189669717345
  (1, 16448)	0.044468215218926674
  (1, 16440)	0.01508657799745058
  (1, 16421)	0.04106740205107267
  :	:
  (1399, 337)	0.04108675028835383
  (1399, 152)	0.028853470309070536
  (1399, 45)	0.026536652130275482
  (1400, 15327)	0.10179062806331833
  (1400, 14034)	0.0995841191921416

In [51]:
# Model input Sequences
pre_train = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer())])

### Naive Bayes Classifier Pipeline
The first one that I am building is the Naive Bayes Classifier. The one most suitable for word counts is the multinomial variant

In [92]:
def nb_classifier(X_train, X_test, y_train, y_test):
    
    nb_fit = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('model', MultinomialNB()),
                 ])
    nb_fit.fit(X_train, y_train)    # Fitting the data to the trianing data
    
    # Making Predictions on the test data
    y_pred = nb_fit.predict(X_test)
    acc = accuracy_score(y_pred=y_pred, y_true=y_test)
    print("Model Accuracy: {}".format(acc))

### Logistic Regression Model

In [93]:
def log_reg(X_train, X_test, y_train, y_test):
    
    logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('model', LogisticRegression(n_jobs=1, C=1e5)),
                 ])
    logreg.fit(X_train, y_train)     # Fitting the data to the trianing data

    # Making Predictions on the test data
    y_pred = logreg.predict(X_test)
    acc = accuracy_score(y_pred=y_pred, y_true=y_test)
    print("Model Accuracy: {}".format(acc))

### Random Forest Model

In [94]:
def random_forest(X_train, X_test, y_train, y_test):
    
    forest = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('model', RandomForestClassifier()),
                 ])
    forest.fit(X_train, y_train)    # Fitting the data to the trianing data
    
    # Making Predictions on the test data
    y_pred = forest.predict(X_test)
    acc = accuracy_score(y_pred=y_pred, y_true=y_test)
    print("Model Accuracy: {}".format(acc))

### Linear SVC

In [102]:
def svc(X_train, X_test, y_train, y_test):
    
    forest = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('model', SVC()),
                 ])
    forest.fit(X_train, y_train)    # Fitting the data to the trianing data
    
    # Making Predictions on the test data
    y_pred = forest.predict(X_test)
    acc = accuracy_score(y_pred=y_pred, y_true=y_test)
    print("Model Accuracy: {}".format(acc))

In [103]:
X_test_counts = count_vec.transform(X_test)
X_test_tfidf = tfidf_trans.transform(X_test_counts)

In [104]:
log_model = SVC()
log_model.fit(X_train_tfidf, y_train)

y_pred = log_model.predict(X_test_tfidf)
accuracy_score(y_pred=y_pred, y_true=y_test)

0.5564516129032258

### Making predictions.

In [106]:
print("Evaluate Naive Bayes Classifier")
nb_classifier(X_train, X_test, y_train, y_test)

print("Evaluate Random Forest Classifier")
random_forest(X_train, X_test, y_train, y_test)

print("Evaluate Logistic Regression Model")
log_reg(X_train, X_test, y_train, y_test)

print("Evaluate SVC Model")
svc(X_train, X_test, y_train, y_test)

Evaluate Naive Bayes Classifier
Model Accuracy: 0.532258064516129
Evaluate Random Forest Classifier
Model Accuracy: 0.47580645161290325
Evaluate Logistic Regression Model


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Model Accuracy: 0.5766129032258065
Evaluate SVC Model
Model Accuracy: 0.5564516129032258


Logistic Regression Model gives the best results and was working well in the flask app. Apart from that the model is not converging for some cases. However, Heroku results in a error that says that module is not found even though it is present there so I will be using SVC right now. 

### SVC Model Tuning

In [108]:
param_grid = {'C': [0.1,1, 10, 100], 
              'gamma': [1,0.1,0.01,0.001],
              'kernel': ['linear', 'rbf', 'poly', 'sigmoid']}

grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3) 
  
# fitting the model for grid search 
grid.fit(X_train_tfidf, y_train) 

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.096, total=   1.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.0s remaining:    0.0s


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.096, total=   1.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.9s remaining:    0.0s


[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.104, total=   1.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.096, total=   1.0s
[CV] C=0.1, gamma=1, kernel=linear ...................................
[CV] ....... C=0.1, gamma=1, kernel=linear, score=0.096, total=   1.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.096, total=   1.0s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.096, total=   1.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.100, total=   1.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .......... C=0.1, gamma=1, kernel=rbf, score=0.096, total=   1.1s
[CV] C=0.1, gamma=1, kernel=rbf ......................................
[CV] .

[CV] ... C=0.1, gamma=0.001, kernel=linear, score=0.096, total=   1.1s
[CV] C=0.1, gamma=0.001, kernel=linear ...............................
[CV] ... C=0.1, gamma=0.001, kernel=linear, score=0.096, total=   1.2s
[CV] C=0.1, gamma=0.001, kernel=linear ...............................
[CV] ... C=0.1, gamma=0.001, kernel=linear, score=0.104, total=   1.1s
[CV] C=0.1, gamma=0.001, kernel=linear ...............................
[CV] ... C=0.1, gamma=0.001, kernel=linear, score=0.096, total=   1.0s
[CV] C=0.1, gamma=0.001, kernel=linear ...............................
[CV] ... C=0.1, gamma=0.001, kernel=linear, score=0.096, total=   1.0s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] ...... C=0.1, gamma=0.001, kernel=rbf, score=0.096, total=   1.0s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] ...... C=0.1, gamma=0.001, kernel=rbf, score=0.096, total=   1.2s
[CV] C=0.1, gamma=0.001, kernel=rbf ..................................
[CV] .

[CV] ...... C=1, gamma=0.1, kernel=sigmoid, score=0.096, total=   1.0s
[CV] C=1, gamma=0.1, kernel=sigmoid ..................................
[CV] ...... C=1, gamma=0.1, kernel=sigmoid, score=0.096, total=   1.0s
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV] ...... C=1, gamma=0.01, kernel=linear, score=0.566, total=   0.9s
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV] ...... C=1, gamma=0.01, kernel=linear, score=0.562, total=   1.0s
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV] ...... C=1, gamma=0.01, kernel=linear, score=0.571, total=   1.0s
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV] ...... C=1, gamma=0.01, kernel=linear, score=0.550, total=   1.0s
[CV] C=1, gamma=0.01, kernel=linear ..................................
[CV] ...... C=1, gamma=0.01, kernel=linear, score=0.550, total=   1.0s
[CV] C=1, gamma=0.01, kernel=rbf .....................................
[CV] .

[CV] ....... C=10, gamma=1, kernel=sigmoid, score=0.559, total=   0.9s
[CV] C=10, gamma=1, kernel=sigmoid ...................................
[CV] ....... C=10, gamma=1, kernel=sigmoid, score=0.557, total=   0.9s
[CV] C=10, gamma=1, kernel=sigmoid ...................................
[CV] ....... C=10, gamma=1, kernel=sigmoid, score=0.539, total=   0.9s
[CV] C=10, gamma=1, kernel=sigmoid ...................................
[CV] ....... C=10, gamma=1, kernel=sigmoid, score=0.546, total=   0.9s
[CV] C=10, gamma=0.1, kernel=linear ..................................
[CV] ...... C=10, gamma=0.1, kernel=linear, score=0.559, total=   0.9s
[CV] C=10, gamma=0.1, kernel=linear ..................................
[CV] ...... C=10, gamma=0.1, kernel=linear, score=0.555, total=   0.9s
[CV] C=10, gamma=0.1, kernel=linear ..................................
[CV] ...... C=10, gamma=0.1, kernel=linear, score=0.561, total=   0.9s
[CV] C=10, gamma=0.1, kernel=linear ..................................
[CV] .

[CV] ...... C=10, gamma=0.001, kernel=poly, score=0.096, total=   1.0s
[CV] C=10, gamma=0.001, kernel=sigmoid ...............................
[CV] ... C=10, gamma=0.001, kernel=sigmoid, score=0.096, total=   0.9s
[CV] C=10, gamma=0.001, kernel=sigmoid ...............................
[CV] ... C=10, gamma=0.001, kernel=sigmoid, score=0.096, total=   0.9s
[CV] C=10, gamma=0.001, kernel=sigmoid ...............................
[CV] ... C=10, gamma=0.001, kernel=sigmoid, score=0.100, total=   0.9s
[CV] C=10, gamma=0.001, kernel=sigmoid ...............................
[CV] ... C=10, gamma=0.001, kernel=sigmoid, score=0.096, total=   0.9s
[CV] C=10, gamma=0.001, kernel=sigmoid ...............................
[CV] ... C=10, gamma=0.001, kernel=sigmoid, score=0.096, total=   0.9s
[CV] C=100, gamma=1, kernel=linear ...................................
[CV] ....... C=100, gamma=1, kernel=linear, score=0.559, total=   0.9s
[CV] C=100, gamma=1, kernel=linear ...................................
[CV] .

[CV] ...... C=100, gamma=0.01, kernel=poly, score=0.100, total=   1.0s
[CV] C=100, gamma=0.01, kernel=poly ..................................
[CV] ...... C=100, gamma=0.01, kernel=poly, score=0.096, total=   1.0s
[CV] C=100, gamma=0.01, kernel=poly ..................................
[CV] ...... C=100, gamma=0.01, kernel=poly, score=0.096, total=   1.0s
[CV] C=100, gamma=0.01, kernel=sigmoid ...............................
[CV] ... C=100, gamma=0.01, kernel=sigmoid, score=0.566, total=   1.0s
[CV] C=100, gamma=0.01, kernel=sigmoid ...............................
[CV] ... C=100, gamma=0.01, kernel=sigmoid, score=0.562, total=   1.0s
[CV] C=100, gamma=0.01, kernel=sigmoid ...............................
[CV] ... C=100, gamma=0.01, kernel=sigmoid, score=0.571, total=   1.1s
[CV] C=100, gamma=0.01, kernel=sigmoid ...............................
[CV] ... C=100, gamma=0.01, kernel=sigmoid, score=0.550, total=   1.1s
[CV] C=100, gamma=0.01, kernel=sigmoid ...............................
[CV] .

[Parallel(n_jobs=1)]: Done 320 out of 320 | elapsed:  5.3min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001],
                         'kernel': ['linear', 'rbf', 'poly', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [109]:
# print best parameter after tuning 
print(grid.best_params_) 

# print how our model looks after hyper-parameter tuning 
print(grid.best_estimator_) 


{'C': 1, 'gamma': 1, 'kernel': 'linear'}
SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


#### Testing the best parameters 

In [110]:
def svc(X_train, X_test, y_train, y_test):
    
    svc_fit = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('model', SVC(C=1, gamma=1, kernel='linear')),
                 ])
    svc_fit.fit(X_train, y_train)    # Fitting the data to the trianing data
    
    # Making Predictions on the test data
    y_pred = svc_fit.predict(X_test)
    acc = accuracy_score(y_pred=y_pred, y_true=y_test)
    print("Model Accuracy: {}".format(acc))

In [112]:
print("Evaluate SVC Model")
svc(X_train, X_test, y_train, y_test)

Evaluate SVC Model
Model Accuracy: 0.6129032258064516


This gives us a 62% accuracy which is the best so far and I will go with this for now. 

### Random Forest Model Tuning

In [59]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid which will include the parameters we will be testing
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [60]:
# Look at the parameter list
from pprint import pprint
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [61]:
''' TAKES TIME TO EXECUTE SO SHOULDN'T EXECUTE AGAIN
# Use the random grid to search for best hyperparameters

# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)

rf_random.fit(X_train_tfidf, y_train)
'''

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 10.7min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 21.6min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
               

#### Evaluation

In [81]:
# Function to evaluate Model performance
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    accuracy = accuracy_score(y_pred=predictions, y_true=test_labels)
    print('Model Performance')
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy

In [82]:
# # Preparing test data
# X_test_counts = count_vec.transform(X_test)
# X_test_tfidf = tfidf_trans.transform(X_test_counts)
# base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
# base_model.fit(X_train_tfidf, y_train)
# base_accuracy = evaluate(base_model, X_test_tfidf , y_test)

Model Performance
Accuracy = 0.44%.


In [87]:
# best_random = rf_random.best_estimator_
# random_accuracy = evaluate(best_random, X_test_tfidf , y_test)

Model Performance
Accuracy = 0.52%.


I still get a 52% accuracy so I will go for an SVC model instead. 

## Saving the model for Deployment
Joblib is part of the SciPy ecosystem and provides utilities for pipelining Python jobs.



In [37]:
# nb_fit = Pipeline([('vect', CountVectorizer()),
#                   ('tfidf', TfidfTransformer()),
#                   ('model', MultinomialNB()),
#                  ])
# nb_fit.fit(X_train, y_train) 

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('model',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [86]:
import joblib 

filename = 'final_model.sav'
joblib.dump(best_random, filename)

['final_model.sav']

## Performance explanation