In [None]:
import pandas as pd 
import numpy as np 
import scipy as sp 
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer 
from sklearn.naive_bayes import MultinomialNB 
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.externals import joblib
from nltk.corpus import stopwords
from scipy.sparse import hstack
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import re
from sklearn.decomposition import TruncatedSVD
from utils1 import *
import string
from pattern.en import suggest
import snowballstemmer
from sklearn import preprocessing
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))

In [None]:
file = "data/train_tweets.txt"
temp = []
with open(file, 'r') as data:
    for line in data:
        # Tabs are treated like spaces; everything before the first space is
        # the user id and everything after it is the tweet text.
        user, _sep, tweet = line.replace('\t', " ").strip().partition(" ")
        temp.append([user, tweet])

## EDA and data manipulation

    Current cleaning: lower-case the text and remove stop words.

In [None]:
# Minimum number of (cleaned) tweets a user must have to be kept for modelling.
min_no_tweets = 5
# Upper bound on the number of tweets sampled per user.
threshold = 100

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

def text_process(text):
    """Strip stop words from ``text``.

    The text is split into ``\\w+`` runs, every run found in ``STOPWORDS`` is
    discarded, and the survivors are glued back together with single spaces.

    Returns a single string (not a list of tokens).
    """
    words = RegexpTokenizer(r'\w+').tokenize(text)
    kept = [word for word in words if word not in STOPWORDS]
    return ' '.join(kept)

In [None]:
def clean_df(tw, min_words=4):
    """Normalise the ``Tweet`` column of ``tw`` in place and drop short tweets.

    Steps, in order: strip URLs, lower-case, replace @mentions with a space,
    remove every character that is not an ASCII letter or whitespace, store the
    whitespace-separated word count in ``num_of_words`` and drop rows with
    fewer than ``min_words`` words.

    Parameters
    ----------
    tw : pandas.DataFrame
        Frame with a string column named ``Tweet``. It is modified in place.
    min_words : int, default 4
        Rows whose cleaned tweet has fewer words than this are removed.

    Returns
    -------
    pandas.DataFrame
        The same ``tw`` object that was passed in.
    """
    tweets = tw["Tweet"]
    # Build the cleaned column and assign it back once. Calling
    # ``tw["Tweet"].replace(..., inplace=True)`` is a chained in-place edit of
    # a column selection: it warns on pandas 2.x and does not update ``tw``
    # when Copy-on-Write is enabled.
    tweets = tweets.str.replace(r'http.?://[^\s]+[\s]?', '', regex=True)
    tweets = tweets.str.lower()
    tweets = tweets.str.replace(r"@\S+", " ", regex=True)
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    tweets = tweets.str.replace(r'[^a-zA-Z\s]', '', regex=True)
    tw["Tweet"] = tweets
    tw["num_of_words"] = tweets.str.split().apply(len)
    tw.drop(tw[tw.num_of_words < min_words].index, inplace=True)
    return tw

In [None]:
# Build the (User, Tweet) frame from the parsed rows and clean it in one step.
tw = clean_df(pd.DataFrame(temp, columns=["User", "Tweet"]))

In [None]:
# Number of cleaned tweets per user, followed by summary statistics of those counts.
cnt_user = tw['User'].value_counts()
cnt_user.describe()

In [None]:
# Show a fixed (seeded) random sample of ten cleaned rows.
tw.sample(10,random_state = 0)

In [None]:
# (rows, columns) of the cleaned frame.
print(tw.shape)

# Feature extraction

    Using TF-IDF and without sampling data

In [None]:
# Keep only users that have at least `min_no_tweets` cleaned tweets.
df = pd.DataFrame(cnt_user)
# Filter the counts Series directly instead of looking up df['User']: since
# pandas 2.0 the value_counts() result is named 'count', so the 'User' column
# lookup raises KeyError there. Boolean indexing on the Series works on every
# pandas version.
top_user = cnt_user[cnt_user >= min_no_tweets].index.tolist()
top_k = tw[tw.User.isin(top_user)]
data = top_k['User'].value_counts()
data.describe()

In [None]:
# Take at most `threshold` randomly chosen tweets per user.
# Shuffling once with a fixed seed and keeping the first `threshold` rows of
# each user gives a uniform per-user sample that is reproducible across runs
# (the previous per-group .sample() call was unseeded). It also keeps every
# column, including 'User', without relying on how groupby.apply treats the
# grouping column.
Tweet = (
    top_k.sample(frac=1, random_state=0)
         .groupby('User', sort=False)
         .head(threshold)
)
Tweet.sample(10,random_state = 0)

In [None]:
# Per-user tweet counts after sampling, then the shape of the sampled frame.
vis = Tweet.User.value_counts()
print(vis.describe(), Tweet.shape, sep="\n")

In [None]:
def stack_features(data):
    """Vectorise ``data`` with the four fitted TF-IDF vectorizers.

    The outputs of ``word1_v``, ``word2_v``, ``word3_v`` and ``char_v`` are
    concatenated column-wise (in that order) and the combined matrix is passed
    through ``preprocessing.normalize`` with its default settings.
    """
    vectorizers = (word1_v, word2_v, word3_v, char_v)
    blocks = [vectorizer.transform(data) for vectorizer in vectorizers]
    return preprocessing.normalize(hstack(blocks))

In [None]:
def _tokenize(text):
    """Adapter for TfidfVectorizer's ``tokenizer`` argument.

    ``text_process`` returns one space-joined string. Handing that string to
    the vectorizer as "tokens" makes it iterate over single characters, so the
    word n-grams silently become character n-grams. Splitting it again yields
    the list of word tokens the vectorizer expects.
    """
    return text_process(text).split()


# Settings shared by the three word-level vectorizers.
_word_opts = dict(min_df=3, sublinear_tf=True, max_df=.75,
                  tokenizer=_tokenize, max_features=20000)

word1_v = TfidfVectorizer(ngram_range=(1, 1), **_word_opts)
word2_v = TfidfVectorizer(ngram_range=(2, 2), **_word_opts)
word3_v = TfidfVectorizer(ngram_range=(3, 3), **_word_opts)
# No `tokenizer` here: scikit-learn only applies it when analyzer == 'word',
# so it never had an effect on the character-level vectorizer.
char_v = TfidfVectorizer(analyzer='char', ngram_range=(2, 4), sublinear_tf=True,
                         max_df=.75, max_features=20000)

# NOTE(review): the vectorizers are fitted on all of `Tweet`, i.e. before the
# train/test split done in a later cell, so the vocabulary and IDF statistics
# also see the held-out tweets.
word1_v.fit(Tweet.Tweet)
word2_v.fit(Tweet.Tweet)
word3_v.fit(Tweet.Tweet)
char_v.fit(Tweet.Tweet)

In [None]:
# Raw tweet text is the input, the user id is the label.
X, y = Tweet.Tweet, Tweet.User
print(X.shape, y.shape, sep="\n")

In [None]:
# 70/30 split with a fixed seed, then replace the text in each split with its
# stacked TF-IDF feature matrix.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=0)
X_train, X_test = stack_features(X_train), stack_features(X_test)
print(X_train.shape)

# Testing various one-vs-rest (OvR) classifiers

In [None]:
from sklearn.linear_model import SGDClassifier
# Candidate classifiers; `test_model` below fits one of these by key.
lr = LogisticRegression()
nb = MultinomialNB()
# Iteration cap set to 5000 for the linear SVM.
svm = LinearSVC(max_iter=5000)
# 100 trees, depth capped at 20, 5000 features considered per split, all cores.
rf = RandomForestClassifier(n_estimators=100, max_depth=20, max_features=5000,n_jobs=-1)

def test_model(model):
    """Fit one of the module-level classifiers on ``X_train`` / ``y_train``.

    Parameters
    ----------
    model : str
        Key of the classifier to fit: ``'LR'`` (logistic regression),
        ``'MNB'`` (multinomial naive Bayes), ``'SVC'`` (linear SVC) or
        ``'RF'`` (random forest).

    Returns
    -------
    The value returned by the chosen estimator's ``fit`` call.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported keys. (Previously an unknown
        key fell through every ``if`` and failed with UnboundLocalError.)
    """
    # Built at call time so that re-running the cell that defines lr/nb/svm/rf
    # is picked up without redefining this function.
    candidates = {
        'LR': (lr, 'Logistic Regression'),
        'MNB': (nb, 'Multinomial Naive Bayes'),
        'SVC': (svm, 'Linear SVC'),
        'RF': (rf, 'Random Forest'),
    }
    if model not in candidates:
        raise ValueError(
            "Unknown model key {0!r}; expected one of {1}".format(model, sorted(candidates))
        )
    estimator, algorithm = candidates[model]
    fit = estimator.fit(X_train, y_train)
    print(algorithm)
    return fit

In [None]:
%%time
# Fit the classifier registered under the 'SVC' key and keep it as `model`.
model = test_model('SVC')

In [None]:
# Predicted user for every row of the held-out feature matrix.
preds = model.predict(X_test)

In [None]:
# Only accuracy is reported for this model.
print("-- One Vs Rest --")
accuracy = metrics.accuracy_score(y_test, preds)
print('Accuracy: ', accuracy)

# Sampling

In [None]:
from imblearn.combine import SMOTEENN
from imblearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Resample with SMOTEENN, then classify with multinomial naive Bayes.
# The sampler is seeded so that repeated runs resample identically; without a
# random_state every run of this cell gave a different training set.
pipe = Pipeline([('sampl', SMOTEENN(random_state=0)),
                 ('clf', MultinomialNB())])

In [None]:
%%time
# Wrap the resample+classify pipeline in a one-vs-rest classifier and fit it
# on the training features.
ovr = OneVsRestClassifier(pipe)
ovr.fit(X_train, y_train)

In [None]:
%%time
# Overwrites `preds` with the one-vs-rest pipeline's predictions.
preds = ovr.predict(X_test)

In [None]:
# `scoring_average` is not assigned in any visible cell of this notebook. Use
# it if it exists (e.g. supplied by a star import), otherwise fall back to
# 'weighted', which is what the "Weighted F1" label below describes.
average = globals().get("scoring_average", "weighted")

print("-- One Vs Rest --")
print("Weighted F1: {0}".format(metrics.f1_score(y_test, preds, average=average)))
print("Precision: {0}".format(metrics.precision_score(y_test, preds, average=average)))
print("Recall: {0}".format(metrics.recall_score(y_test, preds, average=average)))
# Accuracy must be computed on `preds` like the other metrics; the name
# `y_pred_class` used here before is never defined.
print('Accuracy: ', metrics.accuracy_score(y_test, preds))

# Grid Search CV

In [None]:
from skdist.distribute.search import DistGridSearchCV
from pyspark.sql import SparkSession 
from skdist.distribute.multiclass import DistOneVsRestClassifier

In [None]:
# Stop a SparkContext left over from a previous run, if there is one. On a
# fresh kernel `sc` does not exist yet (it is created in the next cell), so an
# unconditional sc.stop() raises NameError under Restart & Run All.
if "sc" in globals():
    sc.stop()

In [None]:
# Create (or reuse) the Spark session used by the distributed estimators.
# Spark configuration values are passed as strings; the bare name `true` that
# was used for offHeap.enabled is not a Python identifier and raised NameError.
spark = (
    SparkSession.builder
    .config("spark.executor.memory", "70g")
    .config("spark.driver.memory", "50g")
    .config("spark.memory.offHeap.enabled", "true")
    .config("spark.memory.offHeap.size", "16g")
    .getOrCreate()
)

sc = spark.sparkContext

In [None]:
# Display the key/value configuration held by the SparkContext.
sc._conf.getAll()

In [None]:
# Candidate values for C and max_iter.
# NOTE(review): param_grid is not passed to any estimator or search object in
# the cells visible here — confirm whether the grid search was meant to run.
param_grid = {
    "C": [0.01, 0.1, 1.0, 10.0],  
    "max_iter" : [1000,3000,5000]
    }

In [None]:
%%time
# Rebinds `model` to a distributed one-vs-rest wrapper around LinearSVC
# (max_iter=1000), constructed with the SparkContext `sc`, and fits it on the
# training features.
model = DistOneVsRestClassifier(LinearSVC(max_iter = 1000), sc)
model.fit(X_train,y_train)

In [None]:
%%time
# Overwrites `preds` with the predictions of the current `model`.
preds = model.predict(X_test)

In [None]:
# `scoring_average` is not assigned in any visible cell of this notebook. Use
# it if it exists (e.g. supplied by a star import), otherwise fall back to
# 'weighted', which is what the "Weighted F1" label below describes.
average = globals().get("scoring_average", "weighted")

print("-- One Vs Rest --")
print("Weighted F1: {0}".format(metrics.f1_score(y_test, preds, average=average)))
print("Precision: {0}".format(metrics.precision_score(y_test, preds, average=average)))
print("Recall: {0}".format(metrics.recall_score(y_test, preds, average=average)))
# Accuracy must be computed on `preds` like the other metrics; the name
# `y_pred_class` used here before is never defined.
print('Accuracy: ', metrics.accuracy_score(y_test, preds))

# Submission Code

In [None]:
def prepare_test_data():
    """Load the unlabeled test tweets and turn them into a feature matrix.

    Reads ``data/test_tweets_unlabeled.txt`` (one tweet per line), cleans the
    text with ``clean_df`` and vectorises it with ``stack_features``.

    Returns
    -------
    The matrix produced by ``stack_features``, with exactly one row per input
    line, in file order.
    """
    file1 = "data/test_tweets_unlabeled.txt"
    with open(file1, 'r') as data:
        lines = data.readlines()
    raw = pd.DataFrame(lines, columns=["Tweet"])

    # clean_df drops short tweets, which would shift every later submission id.
    # Clean a copy, then realign to the original rows; rows that clean_df
    # removed fall back to their lower-cased raw text so that every test line
    # still receives a prediction.
    cleaned = clean_df(raw.copy())
    tweets = cleaned["Tweet"].reindex(raw.index)
    tweets = tweets.fillna(raw["Tweet"].str.lower())

    # Pass the text column only: iterating a DataFrame yields its column
    # names, so vectorising the whole frame would transform the header strings
    # rather than the tweets.
    return stack_features(tweets)
    
def submission_file(data, path='predicted.csv'):
    """Write predictions to a two-column CSV submission file.

    Parameters
    ----------
    data : iterable
        Predicted labels, in test-set order.
    path : str, default 'predicted.csv'
        Destination file; it is overwritten if it exists.

    The file starts with the header ``Id,Predicted``; ids start at 1 and
    follow the order of ``data``.
    """
    import csv
    # newline='' is what the csv module requires for files it writes to;
    # without it, platforms that translate '\n' emit an extra blank line
    # after every row.
    with open(path, 'w', newline='') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(['Id', 'Predicted'])
        for row_id, predicted in enumerate(data, start=1):
            writer.writerow([row_id, predicted])

In [None]:
# Featurise the unlabeled tweets, predict with whichever estimator `model`
# currently refers to, and write the predictions to the submission CSV.
unlabel_data = prepare_test_data() 
unlabel_pred = model.predict(unlabel_data)
submission_file(unlabel_pred)