In [39]:
#importing required libraries
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from gensim import utils
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Raunak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
#importing dataset and removing null values
df=pd.read_csv('data.csv')
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,author,authors,body,comment,comms_num,created,flair,id,score,title,url,combined_features
0,dhavalcoholic,ICICIPruLifeIns,reposting lack activity r askindiahello last y...,dear policy holder dhavalcoholic request help ...,1,1386254000.0,AskIndia,1s57oi,1,need feedback insurance policy took xpost aski...,https://www.reddit.com/r/india/comments/1s57oi...,need feedback insurance policy took xpost aski...
1,amitkumarthakur,RAD-Business RAD-Business None barcam10 _snor...,24hrs local police station register case dont ...,calm downgo sp office town file grievance imme...,24,1554080000.0,AskIndia,b7pvwt,94,somebody want kill full family,https://www.reddit.com/r/india/comments/b7pvwt...,somebody want kill full familycalm downgo sp o...
2,FrustratedOCIHopeful,plshelpthedog ayyylmaaaoo Proper_Boysenberry ...,hello askindia first time poster long time lur...,honestly supervisor behaved exactly government...,27,1555361000.0,AskIndia,bdfid1,10,ambassador india takes back newly issued oci c...,https://www.reddit.com/r/india/comments/bdfid1...,ambassador india takes back newly issued oci c...
3,aloo_vs_bhaloo,vcdarklord tilismilis aloo_vs_bhaloo dogaa fo...,r tooafraidtoask india edition,modi control sex desires jerk someone else pro...,22,1566529000.0,AskIndia,cu1xn4,18,randians afraid ask,https://www.reddit.com/r/india/comments/cu1xn4...,randians afraid askmodi control sex desires je...
4,rushils,mrfreeze2000 avneesh_sethi dr_DCTR ocean_of_s...,us must watched viral video indian family stea...,flight going thailand usually trash always fil...,392,1564537000.0,AskIndia,cjv92h,337,r india whats entitled idiotic indian abroad s...,https://www.reddit.com/r/india/comments/cjv92h...,r india whats entitled idiotic indian abroad s...


In [14]:
#checking frequency of each flair
df['flair'].value_counts()

AskIndia              186
Business/Finance      132
AMA                   126
Food                   83
Sports                 81
Non-Political          81
Photography            75
Scheduled              64
Science/Technology     52
Politics               37
Policy/Economy         32
[R]eddiquette          26
Name: flair, dtype: int64

In [65]:
X=df['title']+df['body']+df['comment']
X=X.values
y=df['flair'].values

In [134]:
#filtering words and preprocessing them
ps=PorterStemmer()
all_posts=[]


for i in range(len(X)):
    post=re.sub('[^a-zA-Z]',' ',X[i])
    post=utils.to_unicode(post)
    post=post.lower()
    post=post.split()
    post=[ps.stem(word) for word in post if not word in set(stopwords.words('english'))]
    post=' '.join(post)
    all_posts.append(post)
df['clean_text']=pd.DataFrame(all_posts)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Raunak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [150]:
#splitting in training and testing data
X_train,X_test,y_train,y_test=train_test_split(df.clean_text.values,df.flair.values,test_size=0.3,random_state=42)

In [151]:
#logistic regression
def logreg_model(X_train,X_test,y_train,y_test):
    print('Using Logistic Regression: ')
    model=Pipeline([('countvect',CountVectorizer()),
                   ('tfidf',TfidfTransformer()),
                   ('logistic',LogisticRegression())])
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    print(classification_report(y_test,y_pred))

In [152]:
#Naive Bayes
def NB_model(X_train,X_Test,y_train,y_test):
    print('Using Naive Bayes')
    model=Pipeline([('countvect',CountVectorizer()),
                   ('tfidf',TfidfTransformer()),
                   ('NB',MultinomialNB())])
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    print(classification_report(y_test,y_pred))

In [153]:
#Random Forest
def RandomForest_model(X_train,X_test,y_train,y_test):
    print('Using Random Forest')
    model=Pipeline([('countvect',CountVectorizer()),
                   ('tfidf',TfidfTransformer()),
                   ('RandomForest',RandomForestClassifier(n_estimators=500))])
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    print(classification_report(y_test,y_pred))

In [154]:
#Multi Layer Perceptron
def MLP_model(X_train,X_test,y_train,y_test):
    print('Using Multi Layer Perceptron')
    model=Pipeline([('countvect',CountVectorizer()),
                   ('tfidf',TfidfTransformer()),
                   ('mlp',MLPClassifier(hidden_layer_sizes=(50,50,30)))])
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    print(classification_report(y_test,y_pred))

In [155]:
#checking each model 
models=['logreg','NB','RandomForest','MLP']

for model in models:
    model_name=eval(model+'_model'+"(X_train,X_test,y_train,y_test)")
    model_name

Using Logistic Regression: 
                    precision    recall  f1-score   support

               AMA       0.91      0.89      0.90        46
          AskIndia       0.56      0.97      0.71        62
  Business/Finance       0.77      0.84      0.80        44
              Food       0.94      0.89      0.91        18
     Non-Political       0.92      0.55      0.69        22
       Photography       1.00      0.78      0.88        18
    Policy/Economy       0.75      0.33      0.46         9
          Politics       1.00      0.20      0.33         5
         Scheduled       1.00      0.73      0.85        15
Science/Technology       1.00      0.56      0.72        16
            Sports       0.96      0.79      0.86        28
     [R]eddiquette       1.00      0.10      0.18        10

          accuracy                           0.77       293
         macro avg       0.90      0.64      0.69       293
      weighted avg       0.84      0.77      0.77       293

Using Nai

  _warn_prf(average, modifier, msg_start, len(result))


                    precision    recall  f1-score   support

               AMA       0.96      0.57      0.71        46
          AskIndia       0.31      1.00      0.48        62
  Business/Finance       0.84      0.82      0.83        44
              Food       1.00      0.56      0.71        18
     Non-Political       0.00      0.00      0.00        22
       Photography       1.00      0.33      0.50        18
    Policy/Economy       0.00      0.00      0.00         9
          Politics       0.00      0.00      0.00         5
         Scheduled       1.00      0.13      0.24        15
Science/Technology       0.00      0.00      0.00        16
            Sports       1.00      0.25      0.40        28
     [R]eddiquette       0.00      0.00      0.00        10

          accuracy                           0.51       293
         macro avg       0.51      0.30      0.32       293
      weighted avg       0.61      0.51      0.46       293

Using Random Forest
                 

In [156]:
#running random forest(best model)
model=Pipeline([('countvect',CountVectorizer()),
               ('tfidf',TfidfTransformer()),
               ('randomForest',RandomForestClassifier(n_estimators=500))])
model.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('countvect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                        max_leaf_nodes=None, max_samples=None,
                                

In [157]:
#saving random forest 
pickle.dump(model,open('model.pkl','wb'))