In [2]:
import pandas as pd
df = pd.read_csv('whole_data.csv')

In [3]:
from io import StringIO

In [4]:
df = df.dropna()

In [5]:
col = ['text', 'class_name']

In [6]:
df = df[col]

In [7]:
df = df[pd.notnull(df['text'])]

In [8]:
df.columns = ['text','class_name']

In [9]:
import string
import nltk as nl
from nltk.corpus import stopwords 
import re
stop_words = set(stopwords.words('english')) 

df['text'] = df['text'].apply(lambda x: str(x).lower())
df['text'] = df['text'].apply(lambda x: x.translate(str.maketrans('','', string.punctuation)))
df['text'] = df['text'].apply(lambda x: x.translate(str.maketrans('','', string.digits)))
df['text'] = df['text'].apply(lambda x: re.sub("[^a-zA-Z0-9]+", " ", x))
df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

In [10]:
df['category_id'] = df['class_name'].factorize()[0]

In [11]:
category_id_df = df[['class_name','category_id']].drop_duplicates().sort_values('category_id')

In [12]:
category_to_id = dict(category_id_df.values)

In [13]:
id_to_category = dict(category_id_df[['category_id','class_name']].values)

In [14]:
df

Unnamed: 0,text,class_name,category_id
0,hello everyone happy sunday welcome latest spi...,Entertainment,0
1,put laptop,Entertainment,0
2,looks like sun wanted introduce look meet sola...,Entertainment,0
3,dear olivei part work culture highly toxic get...,Entertainment,0
7,unfortunately tis season sniffling,Entertainment,0
9,hello everyone moved new apartment last weeken...,Entertainment,0
13,previously,Entertainment,0
15,thanks reading feel feel check related humor,Entertainment,0
16,sending daughter college harder transition fig...,Entertainment,0
17,hey hottie looking good time little bit fire l...,Entertainment,0


In [15]:
import matplotlib.pyplot as plt

In [16]:
fig = plt.figure(figsize=(8,6))

In [17]:
df.groupby('class_name').text.count().plot.bar(ylim=0)

<matplotlib.axes._subplots.AxesSubplot at 0x7fcc0c8dfcf8>

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [19]:
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=7, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')


In [20]:
features = tfidf.fit_transform(df.text).toarray()


In [21]:
labels = df.category_id


In [22]:
features.shape


(1555, 14223)

In [23]:
from sklearn.feature_selection import chi2
import numpy as np

In [24]:
N=2
for Tag_Class, category_id in sorted(category_to_id.items()):
    features_chi2 = chi2(features, labels == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = np.array(tfidf.get_feature_names())[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}':".format(Tag_Class))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-N:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-N:])))

# 'Academics':
  . Most correlated unigrams:
. quantum
. code
  . Most correlated bigrams:
. data structures
. data structure
# 'Entertainment':
  . Most correlated unigrams:
. characters
. film
  . Most correlated bigrams:
. video game
. social media
# 'Life':
  . Most correlated unigrams:
. football
. food
  . Most correlated bigrams:
. football team
. minutes later
# 'Social':
  . Most correlated unigrams:
. emissions
. climate
  . Most correlated bigrams:
. fossil fuel
. climate change


In [25]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [26]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['category_id'],test_size=0.3, random_state = 0)
count_vect = CountVectorizer()
X_train

16      sending daughter college harder transition fig...
853     cannon cravens rides first bull ever professio...
1451    zerotolerance immigration policy trump adminis...
466     someone slightest bit math without doublecheck...
572     antonio regaladoit could anyone easy junjiu huang
658                                          sean captain
578     many people cooking following recipes series i...
748     like capture moments grab air taking pictures ...
1478    faintly remember feels believe god feeling lov...
569     around time started journey research replicati...
719     million people attending american college univ...
716     computer science graduate tutor teacher worked...
1550    hi hi hi mandarin duck herei seen news lot lat...
1134    lila thulinyou seen podium human brain mirror ...
728     lessons followed fairly basic format tweaking ...
1417    new yorkers grumbled way polls year facing lon...
843     hated every fiber beingin beginning merely sym...
468     great 

In [27]:
X_train_counts = count_vect.fit_transform(X_train.values.astype('U')).toarray()
X_train_counts.shape

(1088, 62726)

In [28]:
tfidf_transformer = TfidfTransformer()


In [29]:
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


In [30]:
model = MultinomialNB()

In [31]:
model.fit(X_train_tfidf, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [32]:
y_pred = model.predict(count_vect.transform(X_test.values.astype('U')).toarray())

In [33]:
from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score
print(accuracy_score(y_pred,y_test))
print(f1_score(y_pred,y_test,average='weighted'))
print(precision_score(y_pred,y_test,average='weighted'))
print(recall_score(y_pred,y_test,average='weighted'))

0.655246252677
0.671651122131
0.719674152056
0.655246252677


In [34]:
from sklearn.cross_validation import KFold, cross_val_score



In [35]:
k_fold = KFold(len(X_train),n_folds=10, random_state=None, shuffle=False)
cross_val_score(model, X_train_tfidf, y_train, cv=k_fold, n_jobs=1)

array([ 0.66055046,  0.66972477,  0.66055046,  0.73394495,  0.69724771,
        0.57798165,  0.71559633,  0.58715596,  0.67592593,  0.62962963])

In [36]:
pr = pd.read_csv('original_data_classified.csv',encoding = "ISO-8859-1")

  interactivity=interactivity, compiler=compiler, result=result)


In [37]:
pr = pr[['original_data','class','url','index']]
pr

Unnamed: 0,original_data,class,url,index
0,Three Reasons Why Gun Ownership Rates From Oth...,Negative,https://medium.com/rollingstone/charles-manson...,6193
1,Adapted from a keynote address by Jennifer Ben...,Positive,https://medium.com/p/948dc358540?source=topic_...,11495
2,By Maggie EthridgeIâm writing this to you if...,Negative,,7386
3,âAddress should not determine accessâLast ...,Positive,https://medium.com/p/2f3eff18d6e2?source=topic...,13742
4,Illustration: Rebecca ClarkeIllustration: Rebe...,Positive,https://medium.com/p/27f4d83b4ca0?source=topic...,13997
5,Essay by Sarah Coefield.Itâs late August whe...,Positive,https://medium.com/p/197c2636fafb?source=topic...,10202
6,This entry is part of a project called Craigsl...,Positive,https://medium.com/the-junction/silly-sex-ritu...,4883
7,Scenario: You recently have been diagnosed wit...,Positive,https://medium.com/p/7c24bd243dc6?source=topic...,15002
8,The tipping point was 2008. I was content play...,Positive,https://medium.com/p/8dfef47faf27?source=topic...,19351
9,It is nigh unto impossible for a sequel to be ...,Positive,https://medium.com/p/4561a3064f85?source=topic...,20829


In [38]:
pr = pr[pr['class'] == 'Positive']

In [39]:
pr

Unnamed: 0,original_data,class,url,index
1,Adapted from a keynote address by Jennifer Ben...,Positive,https://medium.com/p/948dc358540?source=topic_...,11495
3,âAddress should not determine accessâLast ...,Positive,https://medium.com/p/2f3eff18d6e2?source=topic...,13742
4,Illustration: Rebecca ClarkeIllustration: Rebe...,Positive,https://medium.com/p/27f4d83b4ca0?source=topic...,13997
5,Essay by Sarah Coefield.Itâs late August whe...,Positive,https://medium.com/p/197c2636fafb?source=topic...,10202
6,This entry is part of a project called Craigsl...,Positive,https://medium.com/the-junction/silly-sex-ritu...,4883
7,Scenario: You recently have been diagnosed wit...,Positive,https://medium.com/p/7c24bd243dc6?source=topic...,15002
8,The tipping point was 2008. I was content play...,Positive,https://medium.com/p/8dfef47faf27?source=topic...,19351
9,It is nigh unto impossible for a sequel to be ...,Positive,https://medium.com/p/4561a3064f85?source=topic...,20829
11,"By Katrina FoxIn August this year, a former Bu...",Positive,https://medium.com/p/d5eed5e7b1fc?source=topic...,11089
13,In the summer of 2010 I started working part-t...,Positive,,7342


In [40]:
pr

Unnamed: 0,original_data,class,url,index
1,Adapted from a keynote address by Jennifer Ben...,Positive,https://medium.com/p/948dc358540?source=topic_...,11495
3,âAddress should not determine accessâLast ...,Positive,https://medium.com/p/2f3eff18d6e2?source=topic...,13742
4,Illustration: Rebecca ClarkeIllustration: Rebe...,Positive,https://medium.com/p/27f4d83b4ca0?source=topic...,13997
5,Essay by Sarah Coefield.Itâs late August whe...,Positive,https://medium.com/p/197c2636fafb?source=topic...,10202
6,This entry is part of a project called Craigsl...,Positive,https://medium.com/the-junction/silly-sex-ritu...,4883
7,Scenario: You recently have been diagnosed wit...,Positive,https://medium.com/p/7c24bd243dc6?source=topic...,15002
8,The tipping point was 2008. I was content play...,Positive,https://medium.com/p/8dfef47faf27?source=topic...,19351
9,It is nigh unto impossible for a sequel to be ...,Positive,https://medium.com/p/4561a3064f85?source=topic...,20829
11,"By Katrina FoxIn August this year, a former Bu...",Positive,https://medium.com/p/d5eed5e7b1fc?source=topic...,11089
13,In the summer of 2010 I started working part-t...,Positive,,7342


In [41]:
y_pred_new = model.predict(count_vect.transform(pr.original_data.values.astype('U')).toarray())
y_pred_new
len(y_pred_new)

3307

In [42]:
pr['topic']=y_pred_new


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [43]:
pr.to_csv('topic_classified_test_data.csv',index=False)
#  df.to_csv(filename, index=False)

In [44]:
from sklearn.svm import LinearSVC


In [45]:
svc_model = LinearSVC()
svc_model.fit(X_train_tfidf, y_train)
y_pred_svc = svc_model.predict(count_vect.transform(X_test.values.astype('U')).toarray())
k_fold = KFold(len(X_train),n_folds=10, random_state=None, shuffle=True)
cross_val_score(svc_model, X_train_tfidf, y_train, cv=k_fold, n_jobs=1)

array([ 0.67889908,  0.67889908,  0.76146789,  0.69724771,  0.73394495,
        0.69724771,  0.70642202,  0.64220183,  0.7037037 ,  0.7037037 ])

In [46]:
from sklearn.metrics import average_precision_score
print(accuracy_score(y_pred_svc,y_test))
print(f1_score(y_pred_svc,y_test,average='weighted'))
print(precision_score(y_pred_svc,y_test,average='weighted'))
print(recall_score(y_pred_svc,y_test,average='weighted'))

0.640256959315
0.641051163297
0.67087489842
0.640256959315
