In [160]:
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
from sqlalchemy import create_engine

import os
import csv
from datetime import datetime

In [115]:
# Location of the SQLite file; create_connection combines it with the file
# name. The empty string leaves the file name unchanged.
loc = ""
# File name of the SQLite database read by this notebook.
db_name = "random_train.db"

In [129]:
def create_connection(db_file):
    """Open a connection to the SQLite database ``db_file``.

    The file name is resolved against the module-level ``loc`` directory with
    ``os.path.join``, so ``loc`` works with or without a trailing path
    separator. Plain string concatenation silently produced a wrong path
    (e.g. ``datarandom_train.db``) when the separator was missing.

    :param db_file: database file name
    :return: Connection object, or None if sqlite3 reports an error
    """
    db_path = os.path.join(loc, db_file)
    try:
        return sqlite3.connect(db_path)
    except sqlite3.Error as e:
        # Best effort: report the problem and let the caller test for None.
        print(e)
        return None

In [130]:
# con is None when sqlite3 raised an error while connecting
# (create_connection prints that error instead of propagating it).
con = create_connection(db_name)

In [131]:
# Load the Tags column of the `data` table into a DataFrame. A missing
# connection is only reported; tag_data is then left undefined.
if con is None:
    print('Conn error')
else:
    tag_data = pd.read_sql('SELECT Tags FROM data', con)

In [132]:
# Preview the first rows of the tag frame and report its dimensions.
tag_preview = tag_data.head()
print(tag_preview)
print('Shape', tag_data.shape)

                                                Tags
0             iphone objective-c ios uiview uibutton
1                    svg internet-explorer-9 raphael
2  validation spring-mvc internationalization cus...
3                             windows java copy text
4                                  javascript jquery
('Shape', (10000, 1))


In [133]:
# Load the Title column of the `data` table into a DataFrame. A missing
# connection is only reported; t_data is then left undefined.
if con is None:
    print('Conn error')
else:
    t_data = pd.read_sql('SELECT Title FROM data', con)

In [134]:
# Preview the first rows of the title frame and report its dimensions.
title_preview = t_data.head()
print(title_preview)
print('Shape', t_data.shape)

                                               Title
0       One tap triggering events on multiple views?
1  IE9 text positioning bug when zoomed in with R...
2  Spring MVC custom errors and internationalization
3                How to copy text from Java program?
4  How to scroll to a part of the page using jQuery?
('Shape', (10000, 1))


## Preprocess title Data

In [135]:
from nltk.tokenize import word_tokenize  

# https://stackoverflow.com/questions/35345761/python-re-split-vs-nltk-word-tokenize-and-sent-tokenize

In [136]:
import re

# Normalise every title in a single pass: collapse each run of characters
# other than ASCII letters into one space, then lower-case what is left.
# Working on the text itself (instead of UTF-8 encoded bytes handed to
# str.lower) yields the same cleaned titles and also runs on Python 3, where
# str.lower() rejects bytes objects.
non_letters = re.compile(r'[^A-Za-z]+')
t_data.Title = t_data.Title.apply(lambda title: non_letters.sub(' ', title).lower())


In [137]:
from nltk.corpus import stopwords

# Build the stop-word set here, ahead of its first use, so this cell also
# works on a fresh kernel run top to bottom (the cell that defines
# stop_words sits further down in the notebook).
stop_words = set(stopwords.words('english'))


def fn(sen):
    """Return ``sen`` with English stop words removed.

    :param sen: sentence to filter
    :return: the tokens of ``sen`` (as produced by ``word_tokenize``) that are
        not in ``stop_words``, joined by single spaces
    """
    return ' '.join(w for w in word_tokenize(sen) if w not in stop_words)


t_data.Title = t_data.Title.apply(fn)


In [138]:
# Show the first five cleaned titles.
t_data['Title'].head(5)

0         one tap triggering events multiple views
1       ie text positioning bug zoomed raphael svg
2    spring mvc custom errors internationalization
3                           copy text java program
4                    scroll part page using jquery
Name: Title, dtype: object

In [46]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [85]:
# NLTK's English stop-word list, held in a set for fast membership tests.
english_words = stopwords.words('english')
stop_words = set(english_words)
stop_words

{u'a',
 u'about',
 u'above',
 u'after',
 u'again',
 u'against',
 u'ain',
 u'all',
 u'am',
 u'an',
 u'and',
 u'any',
 u'are',
 u'aren',
 u"aren't",
 u'as',
 u'at',
 u'be',
 u'because',
 u'been',
 u'before',
 u'being',
 u'below',
 u'between',
 u'both',
 u'but',
 u'by',
 u'can',
 u'couldn',
 u"couldn't",
 u'd',
 u'did',
 u'didn',
 u"didn't",
 u'do',
 u'does',
 u'doesn',
 u"doesn't",
 u'doing',
 u'don',
 u"don't",
 u'down',
 u'during',
 u'each',
 u'few',
 u'for',
 u'from',
 u'further',
 u'had',
 u'hadn',
 u"hadn't",
 u'has',
 u'hasn',
 u"hasn't",
 u'have',
 u'haven',
 u"haven't",
 u'having',
 u'he',
 u'her',
 u'here',
 u'hers',
 u'herself',
 u'him',
 u'himself',
 u'his',
 u'how',
 u'i',
 u'if',
 u'in',
 u'into',
 u'is',
 u'isn',
 u"isn't",
 u'it',
 u"it's",
 u'its',
 u'itself',
 u'just',
 u'll',
 u'm',
 u'ma',
 u'me',
 u'mightn',
 u"mightn't",
 u'more',
 u'most',
 u'mustn',
 u"mustn't",
 u'my',
 u'myself',
 u'needn',
 u"needn't",
 u'no',
 u'nor',
 u'not',
 u'now',
 u'o',
 u'of',
 u'off',
 

In [148]:
# Quick check of the English Snowball stemmer on a single word.
# NOTE(review): `stemmer` is not applied to the titles anywhere in the visible
# part of the notebook — confirm whether stemming was meant to be part of the
# preprocessing.
stemmer = SnowballStemmer("english")
stemmer.stem('wives')

u'wive'

In [149]:
# Plain assignment: title_data is a second name for the same DataFrame object
# as t_data, not a copy.
title_data = t_data

### --------------------------------------

In [150]:
from sklearn.feature_extraction.text import CountVectorizer

In [151]:
# Binary bag-of-words over the tag strings: tags are split on whitespace and
# each one becomes a 0/1 column.
# `binary` must be the boolean True. The string 'true' only worked because
# non-empty strings are truthy; scikit-learn releases that validate
# constructor parameter types reject it.
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(), binary=True)

In [152]:
# Fit the tag vectorizer and encode every row's tags as one row of the
# label matrix.
multi_lab_y = vectorizer.fit_transform(tag_data['Tags'])

In [153]:
# Rows of the label matrix are data points, columns are distinct tags.
n_points, n_tags = multi_lab_y.shape
print("Number of data points :", n_points)
print("Number of unique tags :", n_tags)

('Number of data points :', 10000)
('Number of unique tags :', 6205)


In [154]:
## choose first n tags (desc order of count)

def tags_to_choose(n, labels=None):
    """Keep only the ``n`` most frequent tag columns of a label matrix.

    :param n: number of tag columns to keep
    :param labels: scipy sparse label matrix (rows = data points, columns =
        tags); defaults to the notebook-level ``multi_lab_y``
    :return: matrix with the same rows and the ``n`` most frequent columns,
        most frequent first (ties keep their original column order)
    """
    if labels is None:
        # The previous body read an undefined name (`multilabel_y`) and raised
        # NameError when called; the label matrix built in this notebook is
        # `multi_lab_y`.
        labels = multi_lab_y
    # sum(axis=0) gives a 1 x n_tags matrix; take its single row as a list.
    tag_counts = labels.sum(axis=0).tolist()[0]
    by_count_desc = sorted(range(len(tag_counts)),
                           key=tag_counts.__getitem__, reverse=True)
    return labels[:, by_count_desc[:n]]

<h2>4.2 Split the data into train and test (80:20) </h2>

In [155]:
# Row counts for the split: 80% of the rows (rounded down) go to training.
tot_size = len(title_data)
train_size = int(0.8 * tot_size)
print(tot_size,train_size)

(10000, 8000)


In [156]:
# Sequential (unshuffled) split: the first train_size rows are used for
# training, the remaining rows for testing. Labels are sliced at the same
# row boundary so features and labels stay aligned.
x_train = title_data.iloc[:train_size]
x_test = title_data.iloc[train_size:]

y_train = multi_lab_y[:train_size, :]
y_test = multi_lab_y[train_size:tot_size, :]

## @ Model Testing

In [208]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import LinearSVC

from sklearn import metrics
from sklearn.metrics import f1_score,precision_score,recall_score

In [167]:
# TF-IDF features over whitespace-split titles, using 1- to 3-grams.
# Note: this rebinds `vectorizer`, which previously held the tag
# CountVectorizer.
vectorizer = TfidfVectorizer(
    min_df=0.00009,
    max_features=200000,
    smooth_idf=True,
    norm="l2",
    tokenizer=lambda x: x.split(),
    sublinear_tf=False,
    ngram_range=(1, 3),
)

In [168]:
# Fit the TF-IDF vocabulary on the training titles only, then encode the
# test titles with that same fitted vocabulary.
start = datetime.now()
x_train_multilabel = vectorizer.fit_transform(x_train['Title'])
x_test_multilabel = vectorizer.transform(x_test['Title'])
elapsed = datetime.now() - start
print("Time taken to run this cell :", elapsed)

('Time taken to run this cell :', datetime.timedelta(0, 0, 523872))


In [169]:
# Report feature-matrix and label-matrix shapes for both splits.
train_x_shape, train_y_shape = x_train_multilabel.shape, y_train.shape
test_x_shape, test_y_shape = x_test_multilabel.shape, y_test.shape
print("Dimensions of train data X:",train_x_shape, "Y :",train_y_shape)
print("Dimensions of test data X:",test_x_shape,"Y:",test_y_shape)

('Dimensions of train data X:', (8000, 69639), 'Y :', (8000, 6205))
('Dimensions of test data X:', (2000, 69639), 'Y:', (2000, 6205))


### SGDClassifier with OneVsRest

In [178]:
start = datetime.now()

# One L1-regularised, logistic-loss SGD model per tag, fitted in parallel
# (n_jobs=-1). SGD shuffles the training data, so random_state is pinned to
# make the reported score reproducible from run to run.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' —
# confirm against the installed version before changing it.
classifier = OneVsRestClassifier(
    SGDClassifier(loss='log', alpha=0.00001, penalty='l1', random_state=42),
    n_jobs=-1,
)
classifier.fit(x_train_multilabel, y_train)
predictions = classifier.predict(x_test_multilabel)

print("Time taken to run this cell :", datetime.now() - start)
# On multilabel targets accuracy_score is subset accuracy: a sample counts as
# correct only when its whole set of tags is predicted exactly.
print("accuracy :",metrics.accuracy_score(y_test,predictions))
# Alternative multilabel metrics, left disabled:
#print("macro f1 score :",metrics.f1_score(y_test, predictions, average = 'macro'))
#print("micro f1 scoore :",metrics.f1_score(y_test, predictions, average = 'micro'))
#print("hamming loss :",metrics.hamming_loss(y_test,predictions))
#print("Precision recall report :\n",metrics.classification_report(y_test, predictions))


('accuracy :', 0.049500000000000002)


### Logistic Regression with OneVsRest

In [179]:
start = datetime.now()

# One L1-regularised logistic regression per tag, fitted in parallel.
# The solver is named explicitly because the L1 penalty needs a solver that
# supports it: liblinear does, while the lbfgs default of newer scikit-learn
# releases raises an error for penalty='l1'.
classifier2 = OneVsRestClassifier(
    LogisticRegression(penalty='l1', solver='liblinear'),
    n_jobs=-1,
)
classifier2.fit(x_train_multilabel, y_train)
predictions2 = classifier2.predict(x_test_multilabel)

print("Time taken to run this cell :", datetime.now() - start)
# Subset accuracy: every tag of a sample must match for it to count.
print("accuracy :",metrics.accuracy_score(y_test,predictions2))

('Time taken to run this cell :', datetime.timedelta(0, 89, 9685))
('accuracy :', 0.043499999999999997)


## LDA  -- TIME

In [204]:
# Dense copies of the sparse label matrices.
y_train3 = y_train.todense()
y_test3 = y_test.todense()

# print(...) with a single argument behaves the same on Python 2 and 3 and
# matches the rest of the notebook; the bare `print x` statement is a
# SyntaxError on Python 3.
print(type(y_train))
# Densify one row only: the resulting type is the same, without materialising
# the whole training feature matrix in memory just to inspect its type.
print(type(x_train_multilabel[:1].todense()))

<class 'scipy.sparse.csr.csr_matrix'>
<class 'numpy.matrixlib.defmatrix.matrix'>


In [None]:
# Only the timer start runs in this cell; the LDA experiment below is left
# commented out, so no LDA model is fitted or scored.
start = datetime.now()

# NOTE(review): the disabled code densifies the training features but passes
# the still-sparse test features to predict() — confirm before re-enabling.
#classifier3 = OneVsRestClassifier(LDA())
#classifier3.fit(x_train_multilabel.todense(), y_train)
#predictions3 = classifier3.predict(x_test_multilabel)

#print("Time taken to run this cell :", datetime.now() - start)
#print("accuracy :",metrics.accuracy_score(y_test,predictions3))

## SVC

In [210]:
# One linear SVM per tag; the cell reports its wall-clock time and the
# accuracy_score of the test-set predictions.
start = datetime.now()

classifier4 = OneVsRestClassifier(LinearSVC())
classifier4.fit(x_train_multilabel, y_train)
predictions4 = classifier4.predict(x_test_multilabel)

elapsed = datetime.now() - start
print("Time taken to run this cell :", elapsed)
score4 = metrics.accuracy_score(y_test, predictions4)
print("accuracy :",score4)

('Time taken to run this cell :', datetime.timedelta(0, 69, 99169))
('accuracy :', 0.035999999999999997)
