In [1]:
import pandas as pd
# Load the CSV dataset into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='dataset.csv')
df.head(n=5)

Unnamed: 0,Serial Number,Content,User_Id,Username,Label
0,1,"""Sorry, if you come you will be immediately se...","""25073877""",realdonaldtrump,1
1,2,"""See you on Friday...Big Crowd!","""25073877""",realdonaldtrump,1
2,3,"""True!","""25073877""",realdonaldtrump,1
3,4,"""“NO PRESSURE”""","""25073877""",realdonaldtrump,1
4,5,"""Will be Great!","""25073877""",realdonaldtrump,1


In [2]:
# Keep only the rows whose 'Content' value is present (not NaN/None).
df = df.loc[df['Content'].notna()]

In [3]:
# Print a summary of the frame: index range, per-column non-null counts,
# dtypes and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 106514 entries, 0 to 106513
Data columns (total 5 columns):
Serial Number    106514 non-null int64
Content          106514 non-null object
User_Id          106514 non-null object
Username         106514 non-null object
Label            106514 non-null int64
dtypes: int64(2), object(3)
memory usage: 4.9+ MB


In [4]:
# Restrict the frame to the two columns used from here on: the author
# handle ('Username') and the text ('Content').
col = ['Username', 'Content']
# Take an explicit copy: a later cell adds a 'Label' column to `df`, and
# assigning into a frame obtained by plain selection makes pandas emit
# SettingWithCopyWarning.
df = df[col].copy()

In [5]:
# Display the column labels remaining after the selection above.
df.columns

Index(['Username', 'Content'], dtype='object')

In [6]:
# Re-assign the column labels. The frame was already reduced to
# ['Username', 'Content'] by the selection cell above, so this leaves the
# labels as they were.
df.columns = ['Username', 'Content']

In [7]:
# Encode every username as an integer id; ids are handed out in order of
# first appearance in the frame.
df['Label'] = pd.factorize(df['Username'])[0]
from io import StringIO
# One row per distinct (Username, Label) pair, ordered by the integer id.
category_id_df = df.loc[:, ['Username', 'Label']].drop_duplicates().sort_values(by='Label')
# Lookup tables in both directions: username -> id and id -> username.
category_to_id = {username: label_id for username, label_id in category_id_df.values}
id_to_category = {label_id: username for username, label_id in category_id_df.values}

In [8]:
# Display the frame, which now holds 'Username', 'Content' and the integer
# 'Label' column added above.
df

Unnamed: 0,Username,Content,Label
0,realdonaldtrump,"""Sorry, if you come you will be immediately se...",0
1,realdonaldtrump,"""See you on Friday...Big Crowd!",0
2,realdonaldtrump,"""True!",0
3,realdonaldtrump,"""“NO PRESSURE”""",0
4,realdonaldtrump,"""Will be Great!",0
...,...,...,...
106509,cnn,"""A rafflesia that recently bloomed in a West S...",49
106510,cnn,"""Kentucky's Attorney General says he's asking ...",49
106511,cnn,"""The American Kennel Club has announced two ne...",49
106512,cnn,"""Google has disabled access of Xiaomi devices ...",49


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams:
#   sublinear_tf=True  -> use 1 + log(tf) instead of the raw term frequency
#   min_df=5           -> ignore terms that occur in fewer than 5 documents
#   norm='l2'          -> scale every document vector to unit Euclidean length
#   stop_words='english' -> drop scikit-learn's built-in English stop words
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

# Keep the result as the sparse matrix returned by fit_transform. Calling
# .toarray() here would materialise a dense 106514 x 33416 float64 array
# (roughly 28 GB); chi2 and the estimators used below accept sparse input.
features = tfidf.fit_transform(df['Content'])
labels = df['Label']
features.shape

(106514, 33416)

In [10]:
from sklearn.feature_selection import chi2
import numpy as np

# For every username, print the N unigrams and N bigrams whose chi-squared
# statistic against "tweet belongs to this username" is highest.
N = 2

# The vocabulary does not change between iterations, so build the array of
# feature names once. get_feature_names() is absent from newer scikit-learn
# releases, where get_feature_names_out() replaces it; use whichever exists.
if hasattr(tfidf, 'get_feature_names_out'):
  all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
  all_feature_names = np.asarray(tfidf.get_feature_names())

for username, category_id in sorted(category_to_id.items()):
  # One-vs-rest target: True for rows of the current username.
  features_chi2 = chi2(features, labels == category_id)
  # features_chi2[0] holds the chi2 scores; argsort is ascending, so the
  # most strongly associated terms end up last.
  indices = np.argsort(features_chi2[0])
  feature_names = all_feature_names[indices]
  unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
  bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
  print("# '{}':".format(username))
  print("  . Most correlated unigrams:\n       . {}".format('\n       . '.join(unigrams[-N:])))
  print("  . Most correlated bigrams:\n       . {}".format('\n       . '.join(bigrams[-N:])))

# 'akshaykumar':
  . Most correlated unigrams:
       . thebalachallenge
       . thank
  . Most correlated bigrams:
       . kareenakapoorkhan diljitdosanjh
       . diljitdosanjh advani_kiara
# 'arianagrande':
  . Most correlated unigrams:
       . thankunext
       . love
  . Most correlated bigrams:
       . thank thankunext
       . love sm
# 'barackobama':
  . Most correlated unigrams:
       . president
       . obama
  . Most correlated bigrams:
       . live president
       . president obama
# 'bbcbreaking':
  . Most correlated unigrams:
       . brexit
       . uk
  . Most correlated bigrams:
       . died aged
       . uk pm
# 'beingsalmankhan':
  . Most correlated unigrams:
       . ho
       . uniform
  . Most correlated bigrams:
       . remember need
       . country jai
# 'billgates':
  . Most correlated unigrams:
       . melinda
       . polio
  . Most correlated bigrams:
       . bit ly
       . http bit
# 'britneyspears':
  . Most correlated unigrams:
       . xo
 

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# Hold out part of the data (train_test_split's default split size) with a
# fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(df['Content'], df['Username'], random_state=0)

# Learn the vocabulary from the training texts only, then reuse it for the
# test texts. Calling fit_transform on X_test would rebuild the vocabulary
# from the test data, so the test columns would no longer correspond to the
# features the classifier is trained on (and count_vect itself would be left
# fitted on the test split).
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)

# Same rule for the IDF weights: estimate them on the training counts and
# only apply them to the test counts.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [12]:
# clf was fitted on TF-IDF weighted counts, so new text must go through the
# same two steps (count_vect, then tfidf_transformer) before predicting.
print(clf.predict(tfidf_transformer.transform(count_vect.transform(["Sorry, if you come you will be immediately sent back! "]))))

['theellenshow']


In [13]:
# clf was fitted on TF-IDF weighted counts, so new text must go through the
# same two steps (count_vect, then tfidf_transformer) before predicting.
print(clf.predict(tfidf_transformer.transform(count_vect.transform(["I am disputing the inaccurate information the Chex-Systems has on my credit report. I initially submitted a police report on XXXX/XXXX/16 and Chex Systems only deleted the items that I mentioned in the letter and not all the items that were actually listed on the police report. In other words they wanted me to say word for word to them what items were fraudulent. The total disregard of the police report and what accounts that it states that are fraudulent. If they just had paid a little closer attention to the police report I would not been in this position now and they would n't have to research once again. I would like the reported information to be removed : XXXX XXXX XXXX"]))))

['cnn']


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import cross_val_score


# Candidate classifiers, all evaluated on the same TF-IDF features/labels.
models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]
# Number of cross-validation folds per model.
CV = 5

# Collect one (model_name, fold_idx, accuracy) record per model and fold,
# then build the results frame once. The placeholder DataFrame that used to
# be created before the loop was overwritten unconditionally, so it is gone.
entries = []
for model in models:
  model_name = type(model).__name__
  accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
  for fold_idx, accuracy in enumerate(accuracies):
    entries.append((model_name, fold_idx, accuracy))
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])
cv_df



Unnamed: 0,model_name,fold_idx,accuracy
0,RandomForestClassifier,0,0.182705
1,RandomForestClassifier,1,0.192849
2,RandomForestClassifier,2,0.226046
3,RandomForestClassifier,3,0.204058
4,RandomForestClassifier,4,0.205647
5,LinearSVC,0,0.479835
6,LinearSVC,1,0.562922
7,LinearSVC,2,0.575935
8,LinearSVC,3,0.567182
9,LinearSVC,4,0.535191


In [13]:
import seaborn as sns
import matplotlib.pyplot as plt
# Per-model distribution of fold accuracies: a box plot with the individual
# fold scores overlaid as jittered points on the same axes.
fig, ax = plt.subplots()
sns.boxplot(data=cv_df, x='model_name', y='accuracy', ax=ax)
sns.stripplot(
    data=cv_df,
    x='model_name',
    y='accuracy',
    size=8,
    jitter=True,
    edgecolor="gray",
    linewidth=2,
    ax=ax,
)
plt.show()

<Figure size 640x480 with 1 Axes>