In [None]:
import pandas as pd 
import numpy as np 
import scipy as sp 
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer 
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.externals import joblib
from nltk.corpus import stopwords
from scipy.sparse import hstack
from collections import Counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
import re
from sklearn.decomposition import TruncatedSVD
from utils1 import *
import string
from pattern.en import suggest
import snowballstemmer
import nltk
from sklearn import preprocessing
from nltk.corpus import stopwords
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))
nltk.download('tagsets')

In [None]:
# Read the labelled training tweets. On every line the first space- or
# tab-delimited token is taken as the user id and the rest as the tweet text.
file = "data/train_tweets.txt"
temp = []
with open(file, 'r') as data:
    for line in data:
        # Tabs are turned into single spaces before splitting, so a tab inside
        # the tweet text also ends up as a space.
        user, sep, tweet = line.replace('\t', " ").strip().partition(" ")
        temp.append([user, tweet])

## EDA and data manipulation

    Add, remove, or adjust features as required.

In [None]:
from collections import Counter
from itertools import chain
from nltk import word_tokenize, pos_tag

def check(x):
    """Count the tags in a tagged sentence.

    ``x`` is expected to be a sequence of ``(token, tag)`` pairs; the second
    element of every pair is tallied. Any failure while extracting the tags
    (for instance an empty sequence, which has no second "column") yields an
    empty ``Counter``.
    """
    try:
        columns = list(zip(*x))
        return Counter(columns[1])
    except Exception:
        return Counter()
    
def add_pos(tw):
    """Return a copy of ``tw`` with one POS-tag count column per tag.

    Every ``Tweet_clean`` value is lower-cased, tokenised with
    ``word_tokenize`` and tagged with ``pos_tag``. For each tag that occurs
    anywhere in the frame, a column named after the tag is added (in sorted
    tag order) holding how many tokens of that row carry the tag, 0 if none.

    Notes
    -----
    * The input frame is not modified; no helper columns are written to it.
    * Counts are attached on ``tw``'s own index, so a frame with a filtered
      or otherwise non-default index is handled correctly.
    * The set of tag columns depends on the tags found in ``tw``, so two
      different frames can end up with different columns.
    * If no tags are found at all, an unchanged copy is returned.
    """
    def tok_and_tag(text):
        return pos_tag(word_tokenize(str.lower(text)))

    # One Counter of tag -> occurrences per row (empty Counter for rows
    # without tokens).
    tag_counts = [check(tok_and_tag(text)) for text in tw['Tweet_clean']]
    possible_tags = sorted({tag for counts in tag_counts for tag in counts})
    if not possible_tags:
        return tw.copy()

    # Counter returns 0 for missing keys, which gives the zero-filled vector.
    pos_df = pd.DataFrame(
        [[counts[tag] for tag in possible_tags] for counts in tag_counts],
        index=tw.index,
        columns=possible_tags,
    )
    return tw.assign(**pos_df)

In [None]:
# Minimum number of tweets a user must have to be kept by sample_data().
min_no_tweets = 1
# Upper bound on the number of tweets sampled per user by sample_data().
threshold = 300

In [None]:
from nltk.tokenize import TweetTokenizer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
STOPWORDS = set(stopwords.words('english'))
from nltk.stem import WordNetLemmatizer 
  
# Shared WordNet lemmatizer instance used by lemmatize().
lemmatizer = WordNetLemmatizer()
def lemmatize(word):
    """Return the lemma of ``word``, trying the verb reading before the noun one.

    The noun lemma is only used when lemmatising as a verb leaves the word
    unchanged.
    """
    verb_form = lemmatizer.lemmatize(word, 'v')
    if verb_form != word:
        return verb_form
    return lemmatizer.lemmatize(word, 'n')

def text_process(text):
    """Normalise one tweet into a space-joined string of lemmas.

    The text is lower-cased, tokenised with ``TweetTokenizer`` (handles
    stripped, repeated characters shortened), English stop words are dropped,
    and every remaining token is passed through ``lemmatize``.
    """
    tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True, preserve_case=False)
    lemmas = []
    for token in tokenizer.tokenize(str.lower(text)):
        if token in STOPWORDS:
            continue
        lemmas.append(lemmatize(token))
    return ' '.join(lemmas)

In [None]:
from urllib.parse import urlparse
import html
import unidecode
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Shared VADER analyzer instance used by ret_sen().
analyzer = SentimentIntensityAnalyzer()

def ret_sen(x):
    """Return the VADER polarity scores of ``x`` as a ``pd.Series``.

    If scoring fails for any reason, a fully neutral score is returned
    instead of propagating the error.
    """
    neutral = {'neg':0.0,'neu': 1.0,'pos': 0.0,'compound': 0.0}
    try:
        scores = analyzer.polarity_scores(x)
        return pd.Series(scores)
    except Exception:
        return pd.Series(neutral)

def preprocess(tw, drop_rows=None):
    """Clean ``Tweet_clean`` and add sentiment and POS-count feature columns.

    Parameters
    ----------
    tw : pandas.DataFrame
        Frame with a ``Tweet_clean`` text column. It is not modified.
    drop_rows : bool or None, optional
        Whether exact duplicate rows and rows whose text starts with ``'RT'``
        are removed before cleaning. ``None`` (the default) removes them only
        when the frame has a ``User`` label column. Labelled training data is
        therefore filtered as before, while an unlabelled frame keeps every
        row, so predictions stay aligned one-to-one with the input lines.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index, the cleaned ``Tweet_clean``
        column, the four ``sen_*`` sentiment columns, and the POS-count
        columns added by ``add_pos``.
    """
    if drop_rows is None:
        drop_rows = 'User' in tw.columns
    if drop_rows:
        tw = tw.drop_duplicates()
        tw = tw[~tw['Tweet_clean'].str.startswith('RT')]
    # reset_index returns a new frame, so the column assignments below never
    # touch the caller's frame or a filtered view of it.
    tw = tw.reset_index(drop=True)

    # Reassign instead of chained ``inplace=True`` replaces, which silently do
    # nothing when pandas Copy-on-Write is active.
    tw['Tweet_clean'] = (
        tw['Tweet_clean']
        .apply(html.unescape)                                  # decode HTML entities
        .apply(unidecode.unidecode)                            # transliterate to ASCII
        .replace("(\\r|)\\n$", '', regex=True)                 # trailing line break
        .replace("(@[A-Za-z0-9]+)", "", regex=True)            # @mentions
        .replace(r'http.?://[^\s]+[\s]?', '', regex=True)      # URLs
    )

    tw[['sen_neg','sen_neu','sen_pos','sen_com']] = tw['Tweet_clean'].apply(ret_sen)
    return add_pos(tw)

In [None]:
%%time
# Build the labelled frame from the parsed [user, tweet] rows, keep the raw
# text in "Tweet" and run the cleaning / feature pipeline on "Tweet_clean".
tw = pd.DataFrame(temp,columns = ["User","Tweet"])
tw["Tweet_clean"] = tw['Tweet']
tw = preprocess(tw)

In [None]:
%%time
# Tokenise, remove stop words and lemmatise every cleaned tweet.
tw['Tweet_clean'] = tw['Tweet_clean'].apply(text_process)

In [None]:
# Eyeball ten random rows of the processed frame.
tw.sample(10)

In [None]:
# Distribution of the number of tweets per user.
cnt_user = tw['User'].value_counts()
cnt_user.describe()

# Feature extraction

    Using TF-IDF and without sampling data

In [None]:
def sample_data(tw, random_state=None):
    """Drop empty tweets and cap the number of tweets kept per user.

    Steps:

    1. Rows whose ``Tweet_clean`` has no whitespace-separated word are removed
       and the index is reset.
    2. Only users with at least ``min_no_tweets`` remaining tweets are kept.
    3. For every kept user, ``min(threshold, number of tweets)`` rows are
       drawn at random without replacement.

    Parameters
    ----------
    tw : pandas.DataFrame
        Frame with ``User`` and ``Tweet_clean`` columns. It is not modified.
    random_state : int or None, optional
        Seed for the per-user sampling. ``None`` (the default) draws from
        NumPy's global random state, i.e. results vary between runs.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, ordered by user, with the same columns as ``tw``.
    """
    word_counts = tw['Tweet_clean'].str.split().apply(len)
    kept = tw[word_counts >= 1].reset_index(drop=True)

    # Filter on the counts Series directly: wrapping it in a DataFrame and
    # selecting a 'User' column only works on pandas versions that name the
    # value_counts() result after the source column.
    tweets_per_user = kept['User'].value_counts()
    top_user = tweets_per_user[tweets_per_user >= min_no_tweets].index
    kept = kept[kept['User'].isin(top_user)]

    # One shared generator so that a seed does not produce the same draw
    # pattern for every user.
    rng = None if random_state is None else np.random.RandomState(random_state)
    pieces = [
        group.sample(n=min(threshold, len(group)), random_state=rng)
        for _, group in kept.groupby('User')
    ]
    if not pieces:
        # Nothing left to sample from; pd.concat([]) would raise.
        return kept
    return pd.concat(pieces)

In [None]:
# Replace the working frame with the per-user capped sample.
tw = sample_data(tw)

In [None]:
# Tweets-per-user summary and frame shape after sampling, plus 20 random rows.
vis = tw["User"].value_counts()
print(vis.describe())
print(tw.shape)
tw.sample(20)

In [None]:
# Histogram of the character length of the processed tweets.
length = tw['Tweet_clean'].str.len()

fig, ax = plt.subplots()
ax.hist(length, bins=20, label="tweets")
ax.legend()
plt.show()

## TFIDF

In [None]:
# Cap on the vocabulary size of each vectorizer.
max_f = 60000

# Word 1- to 4-grams on the processed text; the token pattern keeps leading
# '#' characters so hashtags stay distinct from plain words.
word1_v = TfidfVectorizer(sublinear_tf=True, ngram_range=(1, 4), min_df=5,
                          token_pattern=r'(?u)[#]*\b\w\w+\b', max_features=max_f)

# Character 2- to 4-grams on the processed text. token_pattern is only used by
# the word analyzer, so it is not passed here.
char_v = TfidfVectorizer(sublinear_tf=True, ngram_range=(2, 4), min_df=5,
                         max_features=max_f, analyzer='char')

# Hashtags, runs of '!', '?' or '.', and a few emoticons, taken from the raw
# tweet. The pattern has no capturing groups: scikit-learn permits at most one
# and raises ValueError otherwise. The vectorizer lowercases the text before
# tokenising, so the emoticons are written in lower case (':d', ':o');
# upper-case alternatives could never match.
spec_char_v = TfidfVectorizer(sublinear_tf=True,
                              token_pattern=r'(?u)#\w+|!+|\?+|:\)|:d|:o|\.+',
                              max_features=max_f)
word1_v.fit(tw.Tweet_clean)
char_v.fit(tw.Tweet_clean)
spec_char_v.fit(tw.Tweet)

In [None]:
# Feature selection is controlled through the ``dense_groups`` argument.

def stack_features(data, dense_groups=()):
    """Build the sparse feature matrix for ``data``.

    The word-level and character-level TF-IDF of ``Tweet_clean`` and the
    special-character TF-IDF of the raw ``Tweet`` are always included. The
    shapes of these three blocks are printed as one concatenated tuple.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``Tweet_clean`` and ``Tweet`` columns.
    dense_groups : iterable of lists of column names, optional
        Extra numeric feature groups to append, for example
        ``[['sen_neg', 'sen_neu', 'sen_pos', 'sen_com']]``. Each group is
        L2-normalised row-wise on its own before being appended. Columns are
        selected by name, so labelled and unlabelled frames (whose column
        positions differ) are treated the same. The default appends nothing,
        which yields the TF-IDF-only matrix.

    Returns
    -------
    scipy sparse matrix
        The horizontally stacked feature blocks.
    """
    from scipy.sparse import csr_matrix

    w1 = word1_v.transform(data['Tweet_clean'])
    c1 = char_v.transform(data['Tweet_clean'])
    s1 = spec_char_v.transform(data['Tweet'])
    print(w1.shape + c1.shape + s1.shape)

    blocks = [w1, c1, s1]
    for columns in dense_groups:
        dense = preprocessing.normalize(data[list(columns)].values)
        # Wrap as sparse so hstack receives uniform block types.
        blocks.append(csr_matrix(dense))
    return hstack(blocks)


# Train/test split

In [None]:
# The whole frame is the model input (vectorised later by stack_features);
# the user id is the label.
X = tw
y = tw.User
print(X.shape)
print(y.shape)

In [None]:
# Stratified split with 25% of the rows held out; both parts are vectorised
# only after the split.
# NOTE(review): the vectorizers used by stack_features were fitted on all of
# `tw`, which includes the rows that end up in X_test.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y,random_state=0,test_size = .25)
X_train = stack_features(X_train)
print(X_train.shape)
X_test = stack_features(X_test)
print(X_test.shape)

# Testing various One-vs-Rest classifiers

In [None]:
# NOTE(review): SGDClassifier is imported but not used in the visible cells.
from sklearn.linear_model import SGDClassifier
# Candidate estimators; test_model() fits the one selected by its key.
lr = LogisticRegression()
# nb = ComplementNB(norm = True)
nb = MultinomialNB()
svm = LinearSVC(max_iter=10000)
rf = RandomForestClassifier(n_estimators=100, max_depth=20, max_features=5000,n_jobs=-1)

def test_model(model,X,y):
    if model == 'LR':
        fit = lr.fit(X, y)
        algorithm = 'Logistic Regression'
    if model == 'MNB':
        fit = nb.fit(X, y)
        algorithm = 'Multinomial Naive Bayes'
    if model == 'SVC':
        fit = svm.fit(X, y)
        algorithm = 'Linear SVC'   
    if model == 'RF':
        fit = rf.fit(X, y)
        algorithm = 'Random Forest'
    print(algorithm)
    return fit

In [None]:
%%time
# Fit the linear SVC on the training split.
model = test_model('SVC',X_train,y_train)

In [None]:
# Predict the user for every held-out tweet.
preds = model.predict(X_test)

In [None]:
# Report accuracy on the held-out split.
print("-- One Vs Rest --")
# NOTE(review): the three metrics below are disabled; `scoring_average` is not
# defined in the visible cells and must be set before re-enabling them.
# print("Weighted F1: {0}".format(metrics.f1_score(y_test, preds, average=scoring_average)))
# print("Precision: {0}".format(metrics.precision_score(y_test, preds, average=scoring_average)))
# print("Recall: {0}".format(metrics.recall_score(y_test, preds, average=scoring_average)))
print('Accuracy: ', metrics.accuracy_score(y_test, preds))

# Submission Code

In [None]:
def prepare_test_data():
    """Load the unlabelled tweets and turn them into a feature matrix.

    Each line of ``data/test_tweets_unlabeled.txt`` becomes one row. The rows
    go through ``preprocess`` and ``text_process`` and are then vectorised
    with ``stack_features``.

    ``submission_file`` numbers predictions by their position, so the matrix
    must contain exactly one row per input line. If ``preprocess`` returns
    fewer rows than were read, a ``RuntimeWarning`` is issued because the
    resulting Ids would no longer correspond to the input lines.

    Returns
    -------
    The sparse feature matrix produced by ``stack_features``.
    """
    import warnings

    file1 = "data/test_tweets_unlabeled.txt"
    with open(file1, 'r') as data:
        temp = list(data)
    unlabel = pd.DataFrame(temp, columns=["Tweet"])
    unlabel["Tweet_clean"] = unlabel['Tweet']

    n_read = len(unlabel)
    unlabel = preprocess(unlabel)
    if len(unlabel) != n_read:
        warnings.warn(
            "preprocess() kept {} of {} test rows; prediction Ids will not "
            "line up with the input file".format(len(unlabel), n_read),
            RuntimeWarning,
        )

    unlabel["Tweet_clean"] = unlabel["Tweet_clean"].apply(text_process)
    return stack_features(unlabel)
    
def submission_file(data, path='predicted.csv'):
    """Write predictions to a two-column CSV submission file.

    Parameters
    ----------
    data : iterable
        Predicted labels, in the order of the test rows.
    path : str, optional
        Destination file; defaults to ``'predicted.csv'``.

    The file starts with the header ``Id,Predicted``; Ids are 1-based
    positions in ``data``.
    """
    import csv
    # newline='' is required by the csv module; without it, platforms that
    # translate line endings write an extra blank line after every row.
    with open(path, 'w', newline='') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(['Id','Predicted'])
        for row_id, predicted in enumerate(data, start=1):
            writer.writerow([row_id, predicted])

In [None]:
# Vectorise the unlabelled tweets, predict their users with the fitted model
# and write the submission CSV.
unlabel_data = prepare_test_data() 
unlabel_pred = model.predict(unlabel_data)
submission_file(unlabel_pred)