In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from io import StringIO
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
import numpy as np
from sklearn.externals import joblib
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, NuSVC
from sklearn.multiclass import OneVsOneClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.linear_model import Perceptron

In [None]:
# Load the training set. The CSV is read with explicit column names
# ("userID", "tweets"); rows whose tweet text is missing are dropped before
# a structural summary of the frame is printed.
df = pd.read_csv(
    'train_tweets_char_clean.csv',
    names=["userID", "tweets"],
)
has_tweet = df['tweets'].notnull()
df = df[has_tweet]
df.info()

In [None]:
# Give every distinct userID an integer class id and build lookup tables in
# both directions.
col = ['userID', 'tweets']
# .copy() makes `df` an independent frame. It is produced by selecting from
# another frame, and adding a column to such a selection is what triggers
# pandas' SettingWithCopyWarning. Selecting `col` already yields exactly these
# column names, so no separate rename is needed.
df = df[col].copy()
# factorize() returns (codes, uniques); element [0] holds the integer codes.
df['category_id'] = df['userID'].factorize()[0]
# One row per (userID, category_id) pair, ordered by id.
category_id_df = df[['userID', 'category_id']].drop_duplicates().sort_values('category_id')
category_to_id = dict(category_id_df.values)                            # userID -> category_id
id_to_category = dict(category_id_df[['category_id', 'userID']].values)  # category_id -> userID
df.head()

In [None]:
# TF-IDF settings: character-level analyzer with the vocabulary capped at 137
# entries (max_features); ngram_range is left at the library default.
# stop_words is deliberately not passed: scikit-learn applies stop words only
# when analyzer == 'word', so with analyzer='char' the setting had no effect
# on the features (newer releases warn about the unused parameter).
tfidf = TfidfVectorizer(analyzer='char', use_idf=True, max_features=137)

In [None]:
%%time
# X_train = df['tweets']
# y_train = df['userID']
X_train, X_test, y_train, y_test = train_test_split(df['tweets'], df['userID'], test_size=0.1, random_state=2)

# tfidf.fit(df['tweets'])

In [None]:
%%time
xtrain_tfidf =  tfidf.fit_transform(X_train)
xtest_tfidf = tfidf.fit_transform(X_test)
count_vect = CountVectorizer(analyzer='word', stop_words='english')
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(xtrain_tfidf)
x_test_tfidf = tfidf_transformer.fit_transform(xtest_tfidf)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [None]:
# Sanity check: print the shapes of each train/test feature-matrix pair.
for train_matrix, test_matrix in (
    (xtrain_tfidf, xtest_tfidf),
    (x_train_tfidf, x_test_tfidf),
):
    print(train_matrix.shape, test_matrix.shape)
  

In [None]:
%%time
#Liner Model Logistic Regression
clf = LinearSVC(penalty="l2",class_weight='balanced',random_state=1).fit(x_train_tfidf, y_train)


In [None]:
# Score the most recently fitted classifier on the held-out split
# (for scikit-learn classifiers score() reports mean accuracy).
clf.score(X=x_test_tfidf, y=y_test)

In [None]:
%%time
#Nearest Neighbors
clf = NearestCentroid(metric='manhattan')
clf.fit(X_train_tfidf, y_train)

In [None]:
%%time
#MLP Classifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X_train_tfidf,y_train)

In [None]:
%%time
#SVM
clf = NuSVC(nu=0.5, kernel='rbf', degree=3, gamma='auto_deprecated', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', random_state=None)
clf.fit(X_train, y_train)

In [None]:
%%time
#Linear Perceptron
clf = Perceptron()
clf.fit(x_train_tfidf, y_train)

In [None]:
# Smoke test: vectorize one sample tweet with `tfidf` and print the label
# predicted by the most recently fitted classifier.
sample_tweet = "RT @handle: Director of Global Brand Marketing, Hotels and Casino's $125k + 30% bonus - Orlando Fl http://bit.ly/4kUmBB #jobs #twitjobs"
sample_features = tfidf.transform([sample_tweet])
print(clf.predict(sample_features))


In [None]:
import re
def clean_tweet_text(tweet):
    """Strip @mentions, links and #hashtags from a tweet and lower-case it.

    Each removed token also takes one directly following whitespace character
    with it, so no double spaces are left behind.

    NOTE: a later cell defines another function with this same name; whichever
    definition was executed last is the one the prediction loop calls.

    Args:
        tweet: the raw tweet text (str).

    Returns:
        The cleaned, lower-cased text.
    """
    # All patterns are raw strings: '\w' and '\s' inside a plain literal are
    # invalid escape sequences and make Python emit a warning.
    text = re.sub(r'@\w+\s?', '', tweet)
    # `.?` allows one optional character after "http", which covers "https".
    text = re.sub(r'http.?://[^\s]+[\s]?', '', text)
    text = re.sub(r'#\w+\s?', '', text)
    return text.lower()

In [None]:
# Link-only preprocessing: mentions, hashtags and letter case are left as-is.
import re
def clean_tweet_text(tweet):
    """Replace every link in `tweet` with the literal token 'http-web-link'.

    The pattern also consumes one whitespace character directly after the
    link (including a trailing newline), so the placeholder is immediately
    followed by whatever came next in the text.

    The function is pure: it does not print. It is called once per line of
    the test file, so echoing each cleaned line flooded the notebook output.

    Args:
        tweet: the raw tweet text (str).

    Returns:
        The text with links replaced by 'http-web-link'.
    """
    # `.?` allows one optional character after "http", which covers "https".
    return re.sub(r'http.?://[^\s]+[\s]?', 'http-web-link', tweet)

In [None]:
%%time
line_num = 1
with open('./whodunnit/test_tweets_unlabeled.txt') as un_fd:
    with open('result_lin_svc_tfidf_full_wpp.txt','w') as res:
        for line in un_fd.readlines():
            clean_line = clean_tweet_text(line)
            userID = clf.predict(tfidf.transform([clean_line])).tolist()
            print (userID)
            res.write("%s\t%s\n"%(line_num,userID[0]))
            line_num += 1

In [None]:
# Inspect the character-level training feature matrix.
print(xtrain_tfidf)

In [None]:
# Show the configuration of the current classifier. `clf` is reassigned in
# several cells, so this reflects whichever of them was executed last.
print(clf)