In [89]:
import pandas as pd
import numpy as np

In [90]:
# Load the scraped posts and keep only the two columns used below:
# the post title (model input) and its flair (label).
# .copy() makes the selection an independent frame, like pd.concat did.
df = pd.read_csv('India_posts.csv')[['title', 'flair']].copy()
len(df)

5938

In [91]:
# Discard rows with a missing value in any column, then renumber the rows
# from 0 so that later cells can address them by position.
df = df.dropna().reset_index(drop=True)
df.head()


Unnamed: 0,title,flair
0,I like how ITC uses cardboards inside their Yi...,Non-Political
1,"Track your current location for COVID19, put y...",Coronavirus
2,Private labs to begin COVID-19 testing next we...,Coronavirus
3,"Dads, Indian Dads",Non-Political
4,Maharashtra district's zone classifications,Coronavirus


In [92]:
# Number of rows in df at this point (the previous cell dropped rows with missing values).
len(df)

5770

In [93]:
# Flairs kept for classification, listed in the order that defines their
# integer class id (position in the list == id).
_flair_names = [
    'Politics',
    'Non-Political',
    'AskIndia',
    'Policy/Economy',
    'Business/Finance',
    'Science/Technology',
    'Scheduled',
    'Sports',
    'Food',
    'Photography',
    'CAA-NRC-NPR',
    'Coronavirus',
]

# Maps a flair name to its integer class id (0..11).
label_dict = {flair: class_id for class_id, flair in enumerate(_flair_names)}

# Per-flair counters, one entry per flair in label_dict, all starting at zero.
label_dict_count = {flair: 0 for flair in _flair_names}

In [94]:
# Keep only the posts whose flair is one of the classes in label_dict.
# One boolean mask replaces the original row-by-row df.drop(i, inplace=True)
# loop, which rebuilt the frame on every drop and was only correct while the
# index was exactly 0..n-1. The index is renumbered from 0 afterwards.
df = df[df['flair'].isin(list(label_dict))].reset_index(drop=True)

In [95]:
# Number of rows in df at this point (the previous cell removed flairs that are not in label_dict).
len(df)
# The index was already reset at the end of the previous cell, so this stays disabled:
# df.reset_index(drop=True,inplace=True)

5513

In [96]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

# Build the stop-word set once. The original rebuilt it for every single word,
# which is loop-invariant work and dominated the runtime of this cell.
english_stopwords = set(stopwords.words('english'))

# Cleaned version of every title, in the same order as the rows of df.
all_posts = []

# Iterate over the title column by name instead of df.iloc[i][0]; positional
# integer lookup on a label-indexed row is deprecated in newer pandas.
for title in df['title']:
    # Replace everything that is not an ASCII letter with a space, lower-case,
    # and split into tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', title).lower().split()
    # Drop stop words (checked on the unstemmed token), stem what is left.
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    all_posts.append(' '.join(stems))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Raunak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [97]:
# Attach the cleaned titles as a new column. Assigning the list directly is
# positional (row k gets all_posts[k]); wrapping it in a DataFrame first would
# align on index labels and produce NaN whenever df's index is not 0..n-1.
df['clean text'] = all_posts

In [98]:
# Show the last rows to compare each title with its 'clean text' value.
df.tail()

Unnamed: 0,title,flair,clean text
5508,Where can I get information like number and pe...,AskIndia,get inform like number percentag hear impair v...
5509,Inside India's busiest Covid-19 hospital,Coronavirus,insid india busiest covid hospit
5510,How India’s railways are joining the fight aga...,Coronavirus,india railway join fight covid
5511,What are some of the best / unbiased sources o...,AskIndia,best unbias sourc onlin news india
5512,#IndiaCares,Non-Political,indiacar


In [99]:
def word_count(text):
    """Return the number of whitespace-separated words in ``text``.

    ``text`` is converted with ``str()`` first, so non-string values are
    counted by their string representation (e.g. ``123`` counts as one word).

    Splitting on any run of whitespace (``split()``) instead of on a single
    space (``split(' ')``) keeps repeated, leading or trailing spaces from
    being counted as extra empty "words", and makes an empty string count 0.
    """
    return len(str(text).split())

# Word count of every raw title.
df['word_count'] = df['title'].apply(word_count)

# Mean title length per flair. The numeric column is selected before
# aggregating: calling .mean() on the whole grouped frame also tries to
# average the string columns, which raises in pandas >= 2.0.
avg_wc = df.groupby('flair')['word_count'].mean().reset_index()
avg_wc[['flair','word_count']]

Unnamed: 0,flair,word_count
0,AskIndia,12.121996
1,Business/Finance,11.109589
2,CAA-NRC-NPR,13.222222
3,Coronavirus,14.343427
4,Food,9.518519
5,Non-Political,11.812155
6,Photography,12.292683
7,Policy/Economy,14.1625
8,Politics,12.691048
9,Scheduled,8.142857


In [100]:
# Replace each flair name with its integer class id from label_dict.
# A single column-level assignment replaces the original per-row chained
# assignment (df['flair'][i] = ...), which raised SettingWithCopyWarning and is
# not guaranteed to write back into df.
# Values that are not keys of label_dict are left unchanged, as before, so
# re-running the cell on already-encoded labels is harmless.
df['flair'] = df['flair'].map(lambda flair: label_dict.get(flair, flair))
df.head()
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,title,flair,clean text,word_count
0,I like how ITC uses cardboards inside their Yi...,1,like itc use cardboard insid yippe noodl prote...,23
1,"Track your current location for COVID19, put y...",11,track current locat covid put locat name map s...,15
2,Private labs to begin COVID-19 testing next we...,11,privat lab begin covid test next week kiran ma...,15
3,"Dads, Indian Dads",1,dad indian dad,3
4,Maharashtra district's zone classifications,11,maharashtra district zone classif,4


In [102]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams; terms must appear in at least 2 documents
# (min_df) and in at most 95% of them (max_df).
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=.95)

# Cleaned titles, forced to str.
texts = df['clean text'].astype('str')

# Features: fit the vocabulary and convert the sparse result to a dense array.
X = tfidf_vectorizer.fit_transform(texts).toarray()
# Target: the flair column cast to int.
y = df['flair'].values.astype(int)

print(X.shape)
print(y.shape)

(5513, 7087)
(5513,)


In [103]:
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, confusion_matrix
from scipy import interp
from itertools import cycle

In [104]:
# Candidate classifiers keyed by the name shown in the comparison table.
# Models that accept random_state use the same seed (3).
# The key strings are kept exactly as they were, since they are lookup keys
# as well as display labels.
model_dict = {
    'Dummy': DummyClassifier(random_state=3),
    'Stochastic Gradient Descent': SGDClassifier(random_state=3, loss='log'),
    'Random Forest': RandomForestClassifier(random_state=3),
    'Decsision Tree': DecisionTreeClassifier(random_state=3),
    'AdaBoost': AdaBoostClassifier(random_state=3),
    'Gaussian Naive Bayes': GaussianNB(),
    'K Nearest Neighbor': KNeighborsClassifier(),
}

# Hold out 30% of the samples for evaluation. The split is shuffled with a
# fixed seed and stratified on y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=.3,
    shuffle=True,
    stratify=y,
    random_state=3,
)

#Function to get the scores for each model in a df
def model_score_df(model_dict):
    """Fit every model and return a table of its hold-out scores.

    Each model is fitted on the module-level ``X_train`` / ``y_train`` and
    scored on ``X_test`` / ``y_test``. Precision, recall and F1 are
    macro-averaged.

    Parameters
    ----------
    model_dict : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
        The estimators are fitted in place.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``model_name``, ``accuracy_score``,
        ``precision_score``, ``recall_score`` and ``f1_score``, sorted by
        ``f1_score`` in descending order. The row index keeps each model's
        position in ``model_dict``. An empty ``model_dict`` yields an empty
        frame with the same columns.
    """
    columns = ['model_name', 'accuracy_score', 'precision_score', 'recall_score', 'f1_score']

    rows = []
    for name, model in model_dict.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        rows.append((
            name,
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred, average='macro'),
            recall_score(y_test, y_pred, average='macro'),
            f1_score(y_test, y_pred, average='macro'),
        ))

    # Build and sort the table once, after all models are scored. The original
    # rebuilt it on every iteration and failed with UnboundLocalError when
    # model_dict was empty.
    model_comparison_df = pd.DataFrame(rows, columns=columns)
    return model_comparison_df.sort_values(by='f1_score', ascending=False)

# Fit and score every model in model_dict; the returned table is this cell's output.
model_score_df(model_dict)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Unnamed: 0,model_name,accuracy_score,precision_score,recall_score,f1_score
3,Decsision Tree,0.525998,0.394213,0.308185,0.334334
2,Random Forest,0.562273,0.430454,0.292221,0.31561
1,Stochastic Gradient Descent,0.596735,0.4191,0.269003,0.27713
5,Gaussian Naive Bayes,0.443168,0.283638,0.28658,0.274053
4,AdaBoost,0.3948,0.181099,0.173376,0.152568
6,K Nearest Neighbor,0.331923,0.21664,0.155933,0.134175
0,Dummy,0.244861,0.094254,0.0895449,0.091443
