In [5]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in 0.23;
# prefer the standalone joblib package and fall back only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Text Processing

    Read each line of the training file, replace tabs with spaces, and extract the user id and the tweet text separately.

In [7]:
# Parse the training file into [user_id, tweet_text] pairs, one pair per line.
file = "data/train_tweets.txt"
temp = []
with open(file, 'r', encoding="Utf-8") as data:
    for raw_line in data:
        # Tabs become spaces, so the id/text boundary is the first space on the line.
        cleaned = raw_line.replace('\t', " ").strip()
        user_id, sep, tweet_text = cleaned.partition(" ")
        temp.append([user_id, tweet_text])

In [8]:
# Tabulate the parsed rows and lower-case the tweet text.
tw = pd.DataFrame(temp, columns=["User", "Tweet"]).assign(
    Tweet=lambda frame: frame['Tweet'].str.lower()
)
# Rebind the scratch names to empty lists (presumably to release the parsed rows).
temp, elem = [], []

In [9]:
# Preview the first five rows to confirm the User/Tweet columns parsed as expected.
tw.head()

Unnamed: 0,User,Tweet
0,8746,@handle let's try and catch up live next week!
1,8746,going to watch grey's on the big screen - thur...
2,8746,@handle my pleasure patrick....hope you are well!
3,8746,@handle hi there! been traveling a lot and lot...
4,8746,rt @handle looking to drink clean & go green? ...


In [10]:
# Preview the last five rows to confirm the end of the file parsed as expected.
tw.tail()

Unnamed: 0,User,Tweet
328927,4357,steelbox demonstrates open video framework wit...
328928,4357,small businesses rely on sage to help them rid...
328929,4357,timesight systems™ announces next-generation p...
328930,4357,diebold makes its leading monitoring solutions...
328931,4357,gvi security solutions to introduce autoip™ vm...


In [11]:
# (rows, columns) of the tweet table: one row per line read from the training file.
tw.shape

(328932, 2)

### Statistics: the number of tweets per user is on the low side (median of 32 tweets per user)

In [12]:
# Count tweets per user, then summarise the distribution of those counts.
cnt_user = tw.User.value_counts()
cnt_user.describe()

count    9297.000000
mean       35.380445
std        28.146449
min         1.000000
25%        18.000000
50%        32.000000
75%        38.000000
max       284.000000
Name: User, dtype: float64

In [13]:
# Features are the tweet texts and labels are the user ids.
X, y = tw['Tweet'], tw['User']
# Fixed random_state keeps the hold-out split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [14]:
%%time
# Module-level estimator instances with default hyperparameters; test_model()
# fits whichever one its `model` argument selects ('LR', 'MNB' or 'SVC').
lr = LogisticRegression()
nb = MultinomialNB()
svm = LinearSVC()
# hstack joins the char-level and word-level sparse matrices column-wise.
from scipy.sparse import hstack

def test_model(w_vect, c_vect, model):
    """Fit one of the module-level classifiers on word + char n-gram counts and score it.

    Both vectorizers are fitted on the training split only, so the held-out
    tweets cannot influence the vocabulary that is later used to score them.
    The vectorizers are fitted in place and can be reused afterwards to
    transform new text.

    Relies on the notebook-level names X_train, X_test, y_train, y_test,
    lr, nb, svm, metrics and hstack.

    Parameters
    ----------
    w_vect : vectorizer
        Word-level vectorizer exposing fit_transform() and transform().
    c_vect : vectorizer
        Character-level vectorizer exposing fit_transform() and transform().
    model : str
        'LR' (Logistic Regression), 'MNB' (Multinomial Naive Bayes) or
        'SVC' (Linear SVC).

    Returns
    -------
    float
        Accuracy of the selected classifier on the held-out split.

    Raises
    ------
    ValueError
        If `model` is not one of the supported keys.
    """
    estimators = {
        'LR': (lr, 'Logistic Regression'),
        'MNB': (nb, 'Multinomial Naive Bayes'),
        'SVC': (svm, 'Linear SVC'),
    }
    # Validate before the expensive vectorizing/fitting so a typo fails immediately.
    if model not in estimators:
        raise ValueError(
            f"Unknown model {model!r}; expected one of {sorted(estimators)}"
        )
    estimator, algorithm = estimators[model]

    # Learn the vocabularies from the training tweets only (no test-set leakage).
    X_train_dtm_w = w_vect.fit_transform(X_train)
    X_train_dtm_c = c_vect.fit_transform(X_train)
    X_train_dtm = hstack([X_train_dtm_c, X_train_dtm_w])

    print('Features: ', X_train_dtm.shape[1])
    print(f'Rows: {X_train_dtm.shape[0]}')

    # Column order (char features first, then word features) must match training.
    X_test_dtm_w = w_vect.transform(X_test)
    X_test_dtm_c = c_vect.transform(X_test)
    X_test_dtm = hstack([X_test_dtm_c, X_test_dtm_w])

    estimator.fit(X_train_dtm, y_train)
    y_pred_class = estimator.predict(X_test_dtm)

    accuracy = metrics.accuracy_score(y_test, y_pred_class)
    print('Accuracy: ', accuracy)
    print(algorithm)
    return accuracy

Wall time: 0 ns


In [15]:
# Word-level counts: unigrams and bigrams with English stop words removed,
# limited to 15,000 features.
word_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2), min_df=1, max_features=15000)
# Character-level counts: 2- to 4-character n-grams, limited to 50,000 features.
# stop_words is deliberately not passed: scikit-learn only applies it when
# analyzer == 'word', so it was a no-op here (and triggers a warning in newer releases).
char_vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 4), max_features=50000)

In [None]:
%%time
# Fit the vectorizers, train Logistic Regression on the combined char + word
# n-gram counts, and print its accuracy on the held-out split.
test_model(word_vectorizer, char_vectorizer, 'LR')

Features:  65000
Rows: 246699


In [None]:
def prepare_test_data(w_vect, c_vect, path="data/test_tweets_unlabeled.txt"):
    """Vectorize the unlabeled tweets with already-fitted vectorizers.

    Each line of the file is treated as one tweet and is cleaned the same way
    as the training text (tabs replaced by spaces, surrounding whitespace
    stripped, lower-cased) so the features match what the model was trained on.
    No line is dropped, so row i of the result corresponds to line i of the file.

    Parameters
    ----------
    w_vect : fitted vectorizer
        Word-level vectorizer exposing transform().
    c_vect : fitted vectorizer
        Character-level vectorizer exposing transform().
    path : str, optional
        Location of the unlabeled tweet file, one tweet per line.

    Returns
    -------
    scipy sparse matrix
        Character-level features followed by word-level features, in the same
        column order used for training.
    """
    # Read as UTF-8 explicitly, matching how the training file is read, instead of
    # relying on the platform default encoding.
    with open(path, 'r', encoding="utf-8") as data:
        tweets = [line.replace('\t', " ").strip().lower() for line in data]
    unlabel = pd.Series(tweets)
    unlabel_dtm_w = w_vect.transform(unlabel)
    unlabel_dtm_c = c_vect.transform(unlabel)
    return hstack([unlabel_dtm_c, unlabel_dtm_w])
    
def submission_file(data, path='predicted.csv'):
    """Write predictions to a two-column CSV submission file.

    The file has the header ``Id,Predicted`` followed by one row per
    prediction, with ids numbered from 1 in the order of `data`.

    Parameters
    ----------
    data : iterable
        Predicted labels, in the same order as the unlabeled tweets.
    path : str, optional
        Destination file; it is overwritten if it already exists.
    """
    import csv
    # newline='' is required by the csv module; without it Windows writes
    # "\r\r\n" line endings, which show up as a blank row after every record.
    with open(path, 'w', newline='', encoding='utf-8') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(['Id', 'Predicted'])
        writer.writerows(enumerate(data, start=1))

In [None]:
# Vectorize the unlabeled tweets with the vectorizers fitted by test_model().
unlabel_dtm = prepare_test_data(word_vectorizer, char_vectorizer)
# Predict with `lr`: it is the only estimator fitted above (test_model(..., 'LR')).
# `svm` is never fitted in this notebook, so svm.predict() raises NotFittedError
# on a fresh Restart & Run All.
unlabel_pred = lr.predict(unlabel_dtm)
submission_file(unlabel_pred)

In [None]:
# Persist the fitted Logistic Regression model. The file name says "LR", so the
# object written is `lr` rather than the never-fitted `svm`.
model_filename = "LR_model_cw_ngram.pkl"
joblib.dump(lr, model_filename)