**Importing libraries**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
import pickle

**Fetching the Data**

In [2]:
df=pd.read_csv('/content/drive/My Drive/dataset.csv')

In [3]:
df.head(10)

Unnamed: 0,title,score,id,url,comms_num,body,flair,timestamp,cleanbody,cleantitle
0,Covid 19 - Parliamentary Standing Committee on...,4,fyad3a,https://www.reddit.com/r/india/comments/fyad3a...,0,The Parliamentary Standing Committee on Scienc...,Science & Technology,2020-04-10 14:48:16,parlia stand commit sci technolog environ fore...,covid parlia stand commit sci technolog invit ...
1,[NP]FIITJEE - An institute that is supposed to...,251,7h8luq,https://i.redd.it/cex76h7a2o101.jpg,61,,Science & Technology,2017-12-03 16:19:29,,fiits institut suppos teach sci technolog u hi...
2,What are the websites that cover science & tec...,20,5vouxk,https://www.reddit.com/r/india/comments/5vouxk...,16,"Like verge.com, wired.com etc.",Science & Technology,2017-02-23 14:27:51,lik verg com wir com etc,websit cov sci technolog new focus ind
3,[Science & Technology] The secret to longevity...,2,8o1enm,https://www.mcgill.ca/newsroom/channels/news/s...,2,,Science & Technology,2018-06-02 23:43:55,,sci technolog secret longev microbiom gut
4,[Science & Technology] 20 important discoverie...,30,7hhkem,https://a.msn.com/r/2/BBG3HPi?m=en-in,0,,Science & Technology,2017-12-04 22:16:18,,sci technolog import discovery
5,"barkha dutt on Twitter: ""So JP Nadda is the ne...",17,2ls183,https://twitter.com/BDUTT/status/5315062636588...,18,,Science & Technology,2014-11-10 02:00:13,,barkh dut twit nadd new heal min drharshvardha...
6,Centre grants Rs 11 cr to Indian Institute of ...,4,6bvpzj,https://twitter.com/EconomicTimes/status/86507...,2,,Science & Technology,2017-05-18 20:01:38,,cent grant ind institut engin sci technolog ii...
7,A place to discuss scientific and engineering ...,5,5jscfe,https://www.reddit.com/r/india/comments/5jscfe...,3,I've been thinking about this for a while. It ...,Science & Technology,2016-12-23 03:04:49,think would illumin regul discus adv sci engin...,plac discus sci engin research ind sci technolog
8,India to Capture 10% of Global Medical Technol...,40,41o8fp,http://www.prnewswire.co.in/news-releases/indi...,3,,Science & Technology,2016-01-19 19:36:41,,ind capt glob med technolog market harsh vardh...
9,Benefits of cow urine: Science & Technology Mi...,4,6nmxoy,http://www.thehindu.com/news/national/benefits...,1,,Science & Technology,2017-07-17 00:05:29,,benefit cow urin sci technolog min head panel ...


In [4]:
df.fillna("",inplace = True)

In [5]:
df['content']= df['cleantitle'] + ' ' + df['cleanbody']

In [6]:
X = df.content
y = df.flair

In [7]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y = encoder.fit_transform(y)

In [8]:
flairs={'Political','Non-political','Reddiquette','AskIndia','Science & Technology','Policy & Economy','Finance & Business','Sports and food','Photography','AMA'}

**Splitting the dataset**

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

**Trying out various models**

In [10]:
## Logistic Regression
def logisticreg(X_train, X_test, y_train, y_test):

  from sklearn.linear_model import LogisticRegression

  logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression()),
                 ])
  logreg.fit(X_train, y_train)

  y_pred = logreg.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))

In [11]:
## Naive Bayes
def nb_classifier(X_train, X_test, y_train, y_test):

  nb= Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', MultinomialNB()),
                 ])
  nb.fit(X_train, y_train)

  y_pred = nb.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))

In [12]:
## Random forest
def randomforest(X_train, X_test, y_train, y_test):
  ranfor = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),
                 ])
  ranfor.fit(X_train, y_train)

  y_pred = ranfor.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))

In [13]:
## XGBoost
def xgbclassifier(X_train, X_test, y_train, y_test):  
    xgb_clf = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', XGBClassifier(random_state=42, seed=2, colsample_bytree=0.6, subsample=0.7,objective='multi:softmax')),
                 ])
    xgb_clf.fit(X_train, y_train)

    y_pred = xgb_clf.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

In [14]:
logisticreg(X_train, X_test, y_train, y_test)

accuracy 0.7566607460035524
                      precision    recall  f1-score   support

         Reddiquette       1.00      0.80      0.89        46
    Policy & Economy       0.82      0.75      0.78        48
       Non-political       0.60      0.72      0.65        46
         Photography       0.93      0.76      0.83        49
  Finance & Business       1.00      0.67      0.80        49
           Political       0.69      0.60      0.64        48
            AskIndia       0.64      0.86      0.73        90
     Sports and food       1.00      0.57      0.72        23
Science & Technology       0.71      0.66      0.68        59
                 AMA       0.74      0.88      0.80       105

            accuracy                           0.76       563
           macro avg       0.81      0.73      0.76       563
        weighted avg       0.78      0.76      0.76       563



In [15]:
nb_classifier(X_train, X_test, y_train, y_test)

accuracy 0.5310834813499112
                      precision    recall  f1-score   support

         Reddiquette       1.00      0.26      0.41        46
    Policy & Economy       1.00      0.10      0.19        48
       Non-political       0.44      0.76      0.56        46
         Photography       1.00      0.04      0.08        49
  Finance & Business       1.00      0.49      0.66        49
           Political       0.84      0.33      0.48        48
            AskIndia       0.52      0.79      0.63        90
     Sports and food       1.00      0.39      0.56        23
Science & Technology       0.57      0.53      0.55        59
                 AMA       0.43      0.90      0.58       105

            accuracy                           0.53       563
           macro avg       0.78      0.46      0.47       563
        weighted avg       0.71      0.53      0.49       563



In [16]:
randomforest(X_train, X_test, y_train, y_test)

accuracy 0.8241563055062167
                      precision    recall  f1-score   support

         Reddiquette       0.93      0.89      0.91        46
    Policy & Economy       0.96      0.92      0.94        48
       Non-political       0.67      0.78      0.72        46
         Photography       0.92      0.92      0.92        49
  Finance & Business       0.97      0.73      0.84        49
           Political       0.71      0.67      0.69        48
            AskIndia       0.66      0.87      0.75        90
     Sports and food       1.00      0.65      0.79        23
Science & Technology       0.82      0.83      0.82        59
                 AMA       0.93      0.84      0.88       105

            accuracy                           0.82       563
           macro avg       0.86      0.81      0.83       563
        weighted avg       0.84      0.82      0.83       563



In [17]:
xgbclassifier(X_train, X_test, y_train, y_test)

accuracy 0.8543516873889876
                      precision    recall  f1-score   support

         Reddiquette       0.95      0.91      0.93        46
    Policy & Economy       0.98      0.96      0.97        48
       Non-political       0.76      0.83      0.79        46
         Photography       0.91      0.98      0.94        49
  Finance & Business       0.98      0.84      0.90        49
           Political       0.81      0.71      0.76        48
            AskIndia       0.65      0.89      0.75        90
     Sports and food       1.00      0.83      0.90        23
Science & Technology       0.94      0.80      0.86        59
                 AMA       0.93      0.82      0.87       105

            accuracy                           0.85       563
           macro avg       0.89      0.86      0.87       563
        weighted avg       0.87      0.85      0.86       563



Since XGBoost Classifier has the highest accuracy, we will go with it and save the model.

In [18]:
import xgboost as xgb
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state = 42)
model = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', xgb.sklearn.XGBClassifier(random_state=42, seed=2, colsample_bytree=0.6, subsample=0.7,objective='multi:softmax')),
                  ])
XGB = model.fit(X_train, y_train)
pickle.dump(XGB,open("xgb.bin",'wb'))
y_pred = model.predict(X_test)