In [1]:
pip install xgboost




In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
import pickle
from sklearn.pipeline import Pipeline

## Loading the data

In [3]:
data=pd.read_csv('data.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,flair,title,score,id,url,comms_num,body,author,comments,timestamp,combined_features
0,0,Business/Finance,Interest PF taxable: Finance minister eye PF i...,51,laoy5g,https://timesofindia.indiatimes.com/business/i...,20,,satyasys,"PF return fully taxable, point investing PF. G...",2021-02-02 19:30:37,Interest PF taxable: Finance minister eye PF i...
1,1,Food,How survive 500rs(food) 2 weeks?,55,kr3ztg,https://www.reddit.com/r/india/comments/kr3ztg...,79,Hey guys. So time salary going late I'll recei...,Luc_90,"1. 2 kg cheap rice, 50/kg so, (Don't buy boile...",2021-01-06 07:40:59,How survive 500rs(food) 2 weeks?1. 2 kg cheap ...
2,2,Scheduled,"Right wing group labelling resource document ""...",143,lbwl1r,https://www.reddit.com/r/india/comments/lbwl1r...,17,"Recently Greta Thunberg tweeted ""toolkit"" peop...",gobargorab,Anything father modi make go crazy upset. That...,2021-02-04 09:27:18,"Right wing group labelling resource document ""..."
3,3,Food,Ask: What amount pocket money give children?,5,m0auzn,https://www.reddit.com/r/india/comments/m0auzn...,39,"Also, supposed buy pocket money actually spend...",what_is_inflation,You guy get pocket money? My parent never gave...,2021-03-08 21:25:43,Ask: What amount pocket money give children?Yo...
4,4,Photography,"I’ve Recently generated interest photography, ...",0,aaakn4,https://i.redd.it/9tcehs8vz0721.jpg,14,,thesarcasticpage,A photo like letter viewer story want convey. ...,2018-12-29 03:35:37,"I’ve Recently generated interest photography, ..."


In [5]:
data.fillna("",inplace = True)

In [6]:
data.shape

(2280, 12)

In [7]:
data.columns

Index(['Unnamed: 0', 'flair', 'title', 'score', 'id', 'url', 'comms_num',
       'body', 'author', 'comments', 'timestamp', 'combined_features'],
      dtype='object')

In [8]:
data.drop(['Unnamed: 0','id'],axis=1,inplace=True)

In [9]:
data.head()

Unnamed: 0,flair,title,score,url,comms_num,body,author,comments,timestamp,combined_features
0,Business/Finance,Interest PF taxable: Finance minister eye PF i...,51,https://timesofindia.indiatimes.com/business/i...,20,,satyasys,"PF return fully taxable, point investing PF. G...",2021-02-02 19:30:37,Interest PF taxable: Finance minister eye PF i...
1,Food,How survive 500rs(food) 2 weeks?,55,https://www.reddit.com/r/india/comments/kr3ztg...,79,Hey guys. So time salary going late I'll recei...,Luc_90,"1. 2 kg cheap rice, 50/kg so, (Don't buy boile...",2021-01-06 07:40:59,How survive 500rs(food) 2 weeks?1. 2 kg cheap ...
2,Scheduled,"Right wing group labelling resource document ""...",143,https://www.reddit.com/r/india/comments/lbwl1r...,17,"Recently Greta Thunberg tweeted ""toolkit"" peop...",gobargorab,Anything father modi make go crazy upset. That...,2021-02-04 09:27:18,"Right wing group labelling resource document ""..."
3,Food,Ask: What amount pocket money give children?,5,https://www.reddit.com/r/india/comments/m0auzn...,39,"Also, supposed buy pocket money actually spend...",what_is_inflation,You guy get pocket money? My parent never gave...,2021-03-08 21:25:43,Ask: What amount pocket money give children?Yo...
4,Photography,"I’ve Recently generated interest photography, ...",0,https://i.redd.it/9tcehs8vz0721.jpg,14,,thesarcasticpage,A photo like letter viewer story want convey. ...,2018-12-29 03:35:37,"I’ve Recently generated interest photography, ..."


In [10]:
data['flair'].value_counts()

Politics              244
Coronavirus           241
Food                  240
Scheduled             234
Business/Finance      230
AskIndia              226
Science/Technology    220
Photography           219
Policy/Economy        214
Non-Political         212
Name: flair, dtype: int64

In [11]:
flair = ['Politics','Coronavirus','Food','Scheduled','Business/Finance','AskIndia','Science/Technology'
        'Photography','Policy/Economy','Non-Political']

## Trying different models

## SVM

In [12]:
def linear_svm(X_train, X_test, y_train, y_test):
    sgd = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='hinge',alpha=1e-4, random_state=16, max_iter=5, tol=None)),
                 ])
    sgd.fit(X_train, y_train)

    y_pred = sgd.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,labels=flair))

## Random Forest

In [13]:
def randomforest(X_train, X_test, y_train, y_test):
    ranfor = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),
                 ])
    ranfor.fit(X_train, y_train)

    y_pred = ranfor.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,labels=flair))

## XG Boost

In [14]:
def xgbclassifier(X_train, X_test, y_train, y_test):  
    xgb_clf = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', XGBClassifier(random_state=42, seed=2,n_estimators=1000,verbosity=1,objective='multi:softmax')),
                 ])
    xgb_clf.fit(X_train, y_train)

    y_pred = xgb_clf.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,labels=flair))

## Logistic Regression

In [15]:
def logisticreg(X_train, X_test, y_train, y_test):

    from sklearn.linear_model import LogisticRegression

    logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression(n_jobs=1, C=1e5)),
                 ])
    logreg.fit(X_train, y_train)

    y_pred = logreg.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,labels=flair))

## Evaluating the model

In [16]:
def train_test(X,y):
 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)
    print("Results of Linear Support Vector Machine")
    linear_svm(X_train, X_test, y_train, y_test)
    print("Results of Logistic Regression\n")
    logisticreg(X_train, X_test, y_train, y_test)
    print("Results of Random Forest\n")
    randomforest(X_train, X_test, y_train, y_test)
    print("Results of XGB Classifier\n")
    xgbclassifier(X_train, X_test, y_train, y_test)

In [17]:
cat = data['flair']

V = data['combined_features']
W = data['comments']
X = data['title']
Y = data['body']
Z = data['url']

print("Flair Detection using Title as Feature\n")
train_test(X,cat)
print("Flair Detection using Body as Feature")
train_test(Y,cat)
print("Flair Detection using URL as Feature")
train_test(Z,cat)
print("Flair Detection using Comments as Feature")
train_test(W,cat)
print("Flair Detection using Combined Features")
train_test(V,cat)

Flair Detection using Title as Feature

Results of Linear Support Vector Machine
accuracy 0.6491228070175439
                               precision    recall  f1-score   support

                     Politics       0.57      0.66      0.61        47
                  Coronavirus       0.85      0.85      0.85        59
                         Food       0.67      0.57      0.61        46
                    Scheduled       0.58      0.59      0.59        49
             Business/Finance       0.40      0.45      0.43        42
                     AskIndia       0.81      0.74      0.78        47
Science/TechnologyPhotography       0.00      0.00      0.00         0
               Policy/Economy       0.43      0.40      0.42        47
                Non-Political       0.92      0.97      0.95        36

                    micro avg       0.65      0.65      0.65       373
                    macro avg       0.58      0.58      0.58       373
                 weighted avg       0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.6557017543859649
                               precision    recall  f1-score   support

                     Politics       0.68      0.64      0.66        47
                  Coronavirus       0.78      0.86      0.82        59
                         Food       0.71      0.70      0.70        46
                    Scheduled       0.64      0.55      0.59        49
             Business/Finance       0.40      0.43      0.41        42
                     AskIndia       0.81      0.72      0.76        47
Science/TechnologyPhotography       0.00      0.00      0.00         0
               Policy/Economy       0.43      0.47      0.45        47
                Non-Political       0.89      0.94      0.92        36

                    micro avg       0.67      0.66      0.67       373
                    macro avg       0.60      0.59      0.59       373
                 weighted avg       0.67      0.66      0.67       373

Results of Random Forest

accuracy 0.673245614

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.6491228070175439
                               precision    recall  f1-score   support

                     Politics       0.53      0.64      0.58        47
                  Coronavirus       0.82      0.86      0.84        59
                         Food       0.76      0.63      0.69        46
                    Scheduled       0.64      0.51      0.57        49
             Business/Finance       0.41      0.45      0.43        42
                     AskIndia       0.83      0.74      0.79        47
Science/TechnologyPhotography       0.00      0.00      0.00         0
               Policy/Economy       0.42      0.45      0.43        47
                Non-Political       0.97      0.92      0.94        36

                    micro avg       0.66      0.65      0.66       373
                    macro avg       0.60      0.58      0.59       373
                 weighted avg       0.67      0.65      0.66       373

Flair Detection using Body as Feature
Results 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.3355263157894737
                               precision    recall  f1-score   support

                     Politics       0.38      0.06      0.11        47
                  Coronavirus       0.40      0.10      0.16        59
                         Food       0.60      0.20      0.30        46
                    Scheduled       0.38      0.16      0.23        49
             Business/Finance       0.59      0.57      0.58        42
                     AskIndia       0.51      0.43      0.47        47
Science/TechnologyPhotography       0.00      0.00      0.00         0
               Policy/Economy       0.61      0.53      0.57        47
                Non-Political       0.26      0.14      0.18        36

                    micro avg       0.50      0.27      0.35       373
                    macro avg       0.41      0.24      0.29       373
                 weighted avg       0.47      0.27      0.32       373

Results of Logistic Regression



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.36622807017543857
                               precision    recall  f1-score   support

                     Politics       0.17      0.77      0.28        47
                  Coronavirus       0.38      0.10      0.16        59
                         Food       0.50      0.15      0.23        46
                    Scheduled       0.38      0.20      0.27        49
             Business/Finance       0.60      0.57      0.59        42
                     AskIndia       0.40      0.45      0.42        47
Science/TechnologyPhotography       0.00      0.00      0.00         0
               Policy/Economy       0.63      0.40      0.49        47
                Non-Political       0.35      0.19      0.25        36

                    micro avg       0.32      0.35      0.33       373
                    macro avg       0.38      0.32      0.30       373
                 weighted avg       0.42      0.35      0.33       373

Results of Random Forest



  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.41228070175438597
                               precision    recall  f1-score   support

                     Politics       0.17      0.77      0.28        47
                  Coronavirus       0.83      0.08      0.15        59
                         Food       0.60      0.13      0.21        46
                    Scheduled       0.65      0.27      0.38        49
             Business/Finance       0.65      0.67      0.66        42
                     AskIndia       0.42      0.70      0.53        47
Science/TechnologyPhotography       0.00      0.00      0.00         0
               Policy/Economy       0.85      0.47      0.60        47
                Non-Political       0.36      0.11      0.17        36

                    micro avg       0.36      0.39      0.38       373
                    macro avg       0.50      0.35      0.33       373
                 weighted avg       0.58      0.39      0.37       373

Results of XGB Classifier



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.43640350877192985
                               precision    recall  f1-score   support

                     Politics       0.17      0.77      0.28        47
                  Coronavirus       0.80      0.14      0.23        59
                         Food       0.55      0.24      0.33        46
                    Scheduled       0.59      0.33      0.42        49
             Business/Finance       0.80      0.67      0.73        42
                     AskIndia       0.48      0.53      0.51        47
Science/TechnologyPhotography       0.00      0.00      0.00         0
               Policy/Economy       0.90      0.55      0.68        47
                Non-Political       0.24      0.14      0.18        36

                    micro avg       0.38      0.42      0.40       373
                    macro avg       0.50      0.37      0.37       373
                 weighted avg       0.58      0.42      0.42       373

Flair Detection using URL as Feature
Results 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.35526315789473684
                               precision    recall  f1-score   support

                     Politics       0.55      0.34      0.42        47
                  Coronavirus       0.67      0.59      0.63        59
                         Food       0.48      0.33      0.39        46
                    Scheduled       0.56      0.31      0.39        49
             Business/Finance       0.42      0.19      0.26        42
                     AskIndia       0.19      0.96      0.32        47
Science/TechnologyPhotography       0.00      0.00      0.00         0
               Policy/Economy       0.33      0.13      0.18        47
                Non-Political       0.43      0.17      0.24        36

                    micro avg       0.34      0.39      0.37       373
                    macro avg       0.40      0.33      0.32       373
                 weighted avg       0.46      0.39      0.37       373

Results of Random Forest

accuracy 0.30482456

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.2719298245614035
                               precision    recall  f1-score   support

                     Politics       0.48      0.23      0.31        47
                  Coronavirus       0.74      0.59      0.66        59
                         Food       0.50      0.17      0.26        46
                    Scheduled       0.27      0.14      0.19        49
             Business/Finance       0.14      0.79      0.23        42
                     AskIndia       0.03      0.02      0.03        47
Science/TechnologyPhotography       0.00      0.00      0.00         0
               Policy/Economy       0.36      0.09      0.14        47
                Non-Political       0.46      0.17      0.24        36

                    micro avg       0.26      0.28      0.27       373
                    macro avg       0.33      0.24      0.23       373
                 weighted avg       0.38      0.28      0.27       373

Flair Detection using Comments as Feature
Resu

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.43640350877192985
                               precision    recall  f1-score   support

                     Politics       0.45      0.53      0.49        47
                  Coronavirus       0.42      0.58      0.49        59
                         Food       0.69      0.54      0.61        46
                    Scheduled       0.38      0.41      0.39        49
             Business/Finance       0.50      0.62      0.55        42
                     AskIndia       0.24      0.17      0.20        47
Science/TechnologyPhotography       0.00      0.00      0.00         0
               Policy/Economy       0.44      0.30      0.35        47
                Non-Political       0.17      0.14      0.15        36

                    micro avg       0.42      0.42      0.42       373
                    macro avg       0.37      0.37      0.36       373
                 weighted avg       0.42      0.42      0.41       373

Results of Logistic Regression



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy 0.44298245614035087
                               precision    recall  f1-score   support

                     Politics       0.52      0.55      0.54        47
                  Coronavirus       0.44      0.61      0.51        59
                         Food       0.68      0.59      0.63        46
                    Scheduled       0.43      0.41      0.42        49
             Business/Finance       0.52      0.55      0.53        42
                     AskIndia       0.24      0.26      0.24        47
Science/TechnologyPhotography       0.00      0.00      0.00         0
               Policy/Economy       0.40      0.30      0.34        47
                Non-Political       0.20      0.17      0.18        36

                    micro avg       0.43      0.44      0.44       373
                    macro avg       0.38      0.38      0.38       373
                 weighted avg       0.43      0.44      0.43       373

Results of Random Forest



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.46710526315789475
                               precision    recall  f1-score   support

                     Politics       0.41      0.60      0.48        47
                  Coronavirus       0.43      0.71      0.54        59
                         Food       0.67      0.61      0.64        46
                    Scheduled       0.48      0.27      0.34        49
             Business/Finance       0.53      0.62      0.57        42
                     AskIndia       0.33      0.32      0.33        47
Science/TechnologyPhotography       0.00      0.00      0.00         0
               Policy/Economy       0.48      0.21      0.29        47
                Non-Political       0.33      0.33      0.33        36

                    micro avg       0.45      0.47      0.46       373
                    macro avg       0.41      0.41      0.39       373
                 weighted avg       0.46      0.47      0.44       373

Results of XGB Classifier



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.42105263157894735
                               precision    recall  f1-score   support

                     Politics       0.44      0.47      0.45        47
                  Coronavirus       0.43      0.68      0.52        59
                         Food       0.59      0.50      0.54        46
                    Scheduled       0.42      0.37      0.39        49
             Business/Finance       0.55      0.55      0.55        42
                     AskIndia       0.31      0.26      0.28        47
Science/TechnologyPhotography       0.00      0.00      0.00         0
               Policy/Economy       0.43      0.28      0.34        47
                Non-Political       0.15      0.19      0.17        36

                    micro avg       0.41      0.42      0.42       373
                    macro avg       0.37      0.37      0.36       373
                 weighted avg       0.42      0.42      0.41       373

Flair Detection using Combined Features
Resul

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.7302631578947368
                               precision    recall  f1-score   support

                     Politics       0.67      0.64      0.65        47
                  Coronavirus       0.82      0.90      0.85        59
                         Food       0.77      0.72      0.74        46
                    Scheduled       0.84      0.76      0.80        49
             Business/Finance       0.71      0.76      0.74        42
                     AskIndia       0.58      0.55      0.57        47
Science/TechnologyPhotography       0.00      0.00      0.00         0
               Policy/Economy       0.63      0.70      0.67        47
                Non-Political       0.67      0.67      0.67        36

                    micro avg       0.71      0.72      0.72       373
                    macro avg       0.63      0.63      0.63       373
                 weighted avg       0.72      0.72      0.72       373

Results of Logistic Regression



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy 0.7083333333333334
                               precision    recall  f1-score   support

                     Politics       0.69      0.66      0.67        47
                  Coronavirus       0.83      0.88      0.85        59
                         Food       0.72      0.72      0.72        46
                    Scheduled       0.69      0.67      0.68        49
             Business/Finance       0.69      0.81      0.75        42
                     AskIndia       0.45      0.51      0.48        47
Science/TechnologyPhotography       0.00      0.00      0.00         0
               Policy/Economy       0.67      0.60      0.63        47
                Non-Political       0.69      0.69      0.69        36

                    micro avg       0.68      0.70      0.69       373
                    macro avg       0.60      0.62      0.61       373
                 weighted avg       0.68      0.70      0.69       373

Results of Random Forest



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.8048245614035088
                               precision    recall  f1-score   support

                     Politics       0.65      0.74      0.69        47
                  Coronavirus       0.88      0.95      0.91        59
                         Food       0.90      0.78      0.84        46
                    Scheduled       0.88      0.76      0.81        49
             Business/Finance       0.64      0.86      0.73        42
                     AskIndia       0.81      0.83      0.82        47
Science/TechnologyPhotography       0.00      0.00      0.00         0
               Policy/Economy       0.79      0.57      0.67        47
                Non-Political       0.84      0.89      0.86        36

                    micro avg       0.79      0.80      0.80       373
                    macro avg       0.71      0.71      0.70       373
                 weighted avg       0.80      0.80      0.80       373

Results of XGB Classifier



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


accuracy 0.8486842105263158
                               precision    recall  f1-score   support

                     Politics       0.69      0.70      0.69        47
                  Coronavirus       0.98      0.95      0.97        59
                         Food       0.80      0.78      0.79        46
                    Scheduled       0.91      0.84      0.87        49
             Business/Finance       0.78      0.86      0.82        42
                     AskIndia       0.93      0.89      0.91        47
Science/TechnologyPhotography       0.00      0.00      0.00         0
               Policy/Economy       0.77      0.79      0.78        47
                Non-Political       0.89      0.94      0.92        36

                    micro avg       0.85      0.84      0.85       373
                    macro avg       0.75      0.75      0.75       373
                 weighted avg       0.85      0.84      0.85       373



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Saving the model

In [18]:
import xgboost as xgb
X_train, X_test, y_train, y_test = train_test_split(V, cat, test_size=0.2, random_state = 42)
model = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', xgb.sklearn.XGBClassifier(random_state=42,n_estimators=1000,verbosity=1, seed=2,objective='multi:softmax')),
                  ])
XGB = model.fit(X_train, y_train)





In [21]:
# Saving the best model as pickle file which would be used in website

with open("pickle_xgboost_model.pkl", 'wb') as file:  
    pickle.dump(XGB, file)