In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
import numpy as np

# Load the training tweets. The column names are supplied here, so the file is
# read as having no header row (header=None is what pandas applies whenever
# `names` is passed; it is only spelled out for the reader).
df = pd.read_csv(
    'enhanced_train_tweets_2950.csv',
    header=None,
    names=["userID", "tweets"],
)
df.head()

Unnamed: 0,userID,tweets
0,8746,let's try and catch up live next week! going t...
1,2423,do not pay for white teeth! mom's whitening me...
2,564,"is awesome, but is more fun. you can learn a l..."
3,3039,not a cool joke! i got pretty pissed for a spl...
4,9661,ford vehicles called 'world-class' ford vehic...


In [2]:
# Keep only the rows that actually have tweet text.
has_text = df['tweets'].notnull()
df = df[has_text]

In [3]:
# Summarise the frame after the null filter: row count, per-column non-null
# counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9297 entries, 0 to 9296
Data columns (total 2 columns):
userID    9297 non-null int64
tweets    9297 non-null object
dtypes: int64(1), object(1)
memory usage: 217.9+ KB


In [4]:
# Restrict the frame to the two columns the rest of the notebook works with.
col = ['userID', 'tweets']
df = df.loc[:, col]

In [5]:
# Display the column index to confirm the selection above.
df.columns

Index(['userID', 'tweets'], dtype='object')

In [6]:
# Column-name mapping: 'Product' = 'userID', 'Consumer_complaint_narrative' = 'tweets'.
# NOTE(review): 'Product' / 'Consumer_complaint_narrative' are presumably the
# column names of the example this notebook was adapted from — confirm.
df.columns = ['userID', 'tweets']

In [7]:
# Give every distinct userID a dense integer code (numbered in order of first
# appearance) and build lookup tables in both directions.
df['category_id'] = pd.factorize(df['userID'])[0]

category_id_df = (
    df[['userID', 'category_id']]
    .drop_duplicates()
    .sort_values('category_id')
)

# userID -> category_id
category_to_id = dict(category_id_df.values)
# category_id -> userID
id_to_category = dict(category_id_df[['category_id', 'userID']].values)

In [8]:
# Preview the frame, which now carries the integer category_id column.
df.head()

Unnamed: 0,userID,tweets,category_id
0,8746,let's try and catch up live next week! going t...,0
1,2423,do not pay for white teeth! mom's whitening me...,1
2,564,"is awesome, but is more fun. you can learn a l...",2
3,3039,not a cool joke! i got pretty pissed for a spl...,3
4,9661,ford vehicles called 'world-class' ford vehic...,4


In [9]:
# Tf-idf over unigrams and bigrams: English stop words removed, terms must occur
# in at least 5 tweets, sub-linear term frequency, L2-normalised rows.
tfidf = TfidfVectorizer(
    encoding='latin-1',
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    sublinear_tf=True,
    norm='l2',
)

labels = df['category_id']
# Dense (n_tweets, n_terms) array of tf-idf weights.
features = tfidf.fit_transform(df['tweets']).toarray()
features.shape

(9297, 47270)

In [None]:
# Rank the tf-idf features for every user with a chi-squared test, one user at a
# time (the target is the boolean "this tweet belongs to the user").
N = 2  # NOTE(review): not used by this loop — presumably a top-N cutoff for the n-gram lists; confirm.

# The vocabulary is the same for every user, so build the name array once
# instead of on each iteration. get_feature_names() was removed in
# scikit-learn 1.2 in favour of get_feature_names_out(); support both.
_get_feature_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
all_feature_names = np.asarray(_get_feature_names())

for check, (user_id, category_id) in enumerate(sorted(category_to_id.items())):
    print ("check",check,"catid",category_id)
    features_chi2 = chi2(features, labels == category_id)
    # features_chi2[0] holds the chi2 statistics; argsort orders them ascending,
    # so the highest-scoring terms end up at the tail of the lists below.
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]

In [10]:
# Hold out part of the tweets for testing; the fixed seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df.tweets, df.userID, random_state=0
)

# Bag-of-words counts first, then tf-idf re-weighting; both are fitted on the
# training split only.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [None]:
# Linear SVM with default settings, fitted on the tf-idf training matrix.
# (A MultinomialNB fit on the same inputs was the alternative tried here.)
clf = LinearSVC().fit(X=X_train_tfidf, y=y_train)

In [None]:
# clf was fitted on tf-idf weighted counts (X_train_tfidf), so the sample tweet
# has to go through the same count -> tf-idf pipeline. Feeding raw counts gives
# the model features weighted differently from the ones it was trained on.
sample_tweet = "RT @handle: Director of Global Brand Marketing, Hotels and Casino's $125k + 30% bonus - Orlando Fl http://bit.ly/4kUmBB #jobs #twitjobs"
sample_tfidf = tfidf_transformer.transform(count_vect.transform([sample_tweet]))
print(clf.predict(sample_tfidf))

In [None]:
# Linear SVM using the explicit one-vs-rest multi-class strategy, fitted on the
# tf-idf training matrix.
clf_lin_ovr = LinearSVC(multi_class="ovr").fit(X=X_train_tfidf, y=y_train)

In [None]:
# clf_lin_ovr was fitted on tf-idf weighted counts (X_train_tfidf), so the sample
# tweet has to go through the same count -> tf-idf pipeline. Feeding raw counts
# gives the model features weighted differently from the ones it was trained on.
sample_tweet = "RT @handle: Director of Global Brand Marketing, Hotels and Casino's $125k + 30% bonus - Orlando Fl http://bit.ly/4kUmBB #jobs #twitjobs"
sample_tfidf = tfidf_transformer.transform(count_vect.transform([sample_tweet]))
print(clf_lin_ovr.predict(sample_tfidf))

In [None]:
%%time
# Linear SVM using the Crammer-Singer multi-class formulation, fitted on the
# tf-idf training matrix. Note that, despite the `_ovo` suffix in the variable
# name, the strategy requested here is "crammer_singer".
clf_lin_ovo = LinearSVC(multi_class="crammer_singer").fit(X=X_train_tfidf, y=y_train)

In [None]:
# clf_lin_ovo was fitted on tf-idf weighted counts (X_train_tfidf), so the sample
# tweet has to go through the same count -> tf-idf pipeline. Feeding raw counts
# gives the model features weighted differently from the ones it was trained on.
sample_tweet = "RT @handle: Director of Global Brand Marketing, Hotels and Casino's $125k + 30% bonus - Orlando Fl http://bit.ly/4kUmBB #jobs #twitjobs"
sample_tfidf = tfidf_transformer.transform(count_vect.transform([sample_tweet]))
print(clf_lin_ovo.predict(sample_tfidf))

In [None]:
import re
def clean_tweet_text(tweet):
    """Normalise a raw tweet into lowercase, letters-only text.

    Steps, in order: drop @mentions (and one trailing whitespace character),
    drop http/https URLs (and one trailing whitespace character), drop every
    character that is neither an ASCII letter nor whitespace, then lowercase.

    Args:
        tweet: raw tweet text.

    Returns:
        The cleaned, lowercased string. Whitespace between words is kept, so
        runs of spaces may remain where punctuation stood between two spaces.
    """
    text = re.sub(r'@\w+\s?', '', tweet)
    # `https?` (optional "s") instead of `http.?`, whose `.` matched any character.
    text = re.sub(r'https?://\S+\s?', '', text)
    # Remove only the unwanted character itself. Also swallowing the whitespace
    # after it glued neighbouring words together ("hi, there" -> "hithere").
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text.lower()

In [None]:
# Predict an author for every unlabeled tweet and write one
# "<line number>\t<predicted userID>" row per input line (numbering starts at 1).
with open('./whodunnit/test_tweets_unlabeled.txt') as un_fd:
    # Iterate the file directly instead of materialising it with readlines().
    cleaned_tweets = [clean_tweet_text(line) for line in un_fd]

if cleaned_tweets:
    # clf_lin_ovo was fitted on tf-idf weighted counts, so the test tweets must
    # go through the same count -> tf-idf pipeline before prediction. Doing this
    # for the whole batch replaces one transform/predict round-trip per tweet.
    test_tfidf = tfidf_transformer.transform(count_vect.transform(cleaned_tweets))
    predicted_user_ids = clf_lin_ovo.predict(test_tfidf).tolist()
else:
    # An empty input file simply yields an empty result file.
    predicted_user_ids = []

with open('result_linear_svm_cs.txt','w') as res:
    for line_num, userID in enumerate(predicted_user_ids, start=1):
        res.write("%s\t%s\n"%(line_num,userID))

# Show how many predictions were written rather than echoing each one.
len(predicted_user_ids)