In [124]:
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
from sqlalchemy import create_engine

import os
import csv
from datetime import datetime

In [125]:
from sklearn.externals import joblib ## model persistance

In [126]:
loc = ''
db_name = 'random_train.db'

In [127]:
def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by db_file
    :param db_file: database file
    :return: Connection object or None
    """
    try:
        conn = sqlite3.connect(loc+db_file)
        return conn
    except sqlite3.Error as e:
        print(e)
 
    return None

In [128]:
con = create_connection(db_name)

In [129]:
tag_rm = list( pd.read_csv('tag_to_remove.csv')['Tags'] )
t = [str(i) for i in tag_rm]
li_string = '("'+ '","'.join(t) +'")'

In [130]:
if con is not None:
    tag_data = pd.read_sql('SELECT Tags FROM data WHERE Tags NOT IN '+li_string,con)
else :
    print('Conn error')

In [131]:
print(tag_data.head())
print('Shape',tag_data.shape)

                                                Tags
0             iphone objective-c ios uiview uibutton
1                    svg internet-explorer-9 raphael
2  validation spring-mvc internationalization cus...
3                             windows java copy text
4                                  javascript jquery
('Shape', (9805, 1))


In [132]:
if con is not None:
    t_data = pd.read_sql('SELECT Title FROM data WHERE Tags NOT IN '+li_string,con)
else :
    print('Conn error')


In [133]:

#li_string
if con is not None:
    temp = pd.read_sql('SELECT Tags FROM data WHERE Tags NOT IN '+li_string,con)
else :
    print('Conn error')
#temp    

In [134]:
print(t_data.head())
print('Shape',t_data.shape)

                                               Title
0       One tap triggering events on multiple views?
1  IE9 text positioning bug when zoomed in with R...
2  Spring MVC custom errors and internationalization
3                How to copy text from Java program?
4  How to scroll to a part of the page using jQuery?
('Shape', (9805, 1))


## Preprocess title Data

In [135]:
from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
# https://stackoverflow.com/questions/35345761/python-re-split-vs-nltk-word-tokenize-and-sent-tokenize

In [136]:
import re

#t_data.head()
t_data.Title = t_data.Title.apply(lambda x : x.encode('utf-8'))
t_data.Title = t_data.Title.apply(lambda x : str.lower(x))
t_data.Title = t_data.Title.apply(lambda x : re.sub(r'[^A-Za-z0-9#+.\-]+',' ',x))
#title_data = t_data


In [137]:
def fn(sen):
    return ' '.join(w for w in word_tokenize(sen) if w not in stop_words)

t_data.Title = t_data.Title.apply(lambda x : fn(x))#' '.join for w in word_tokenize(x) if w not in stop_words)


In [138]:
t_data.Title[:5]

0         one tap triggering events multiple views
1      ie9 text positioning bug zoomed raphael svg
2    spring mvc custom errors internationalization
3                           copy text java program
4                    scroll part page using jquery
Name: Title, dtype: object

In [16]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer


In [148]:
stemmer = SnowballStemmer("english")
stemmer.stem('wives')

u'wive'

In [139]:
title_data = t_data

### ----- Body

In [95]:
if con is not None:
    b_data = pd.read_sql('SELECT Body FROM data  WHERE Tags NOT IN '+li_string,con)
else :
    print('Conn error')

In [96]:
print(b_data[:5])
print('Shape',b_data.shape)

                                                Body
0  <p>In my program I have placed a <code>UIButto...
1  <p>I'm not sure if this is a bug with Raphael ...
2  <p>In my web application, I handle errors with...
3  <p>I am using a Java program called <a href="h...
4  <p>My code scrolls the user to the bottom of t...
('Shape', (9805, 1))


In [97]:
b_data.Body = b_data.Body.apply(lambda x : re.sub(r'<code>(.*?)</code>',' ',x))
b_data.Body = b_data.Body.apply(lambda x : re.sub(r'<.*?>',' ',x))
b_data.Body = b_data.Body.apply(lambda x : re.sub(r'[^A-Za-z]+',' ',x))
b_data.Body = b_data.Body.apply(lambda x : x.encode('utf-8'))
b_data.Body = b_data.Body.apply(lambda x : str.lower(x))

b_data.Body = b_data.Body.apply(lambda x : fn(x))

In [98]:
b_data[:5]

Unnamed: 0,Body
0,program placed subview set button tapped event...
1,sure bug raphael svg anyone fix workaround wou...
2,web application handle errors annotations ever...
3,using java program called jdownloader v wish c...
4,code scrolls user bottom page var elem body ht...


In [99]:
title_data = b_data

## ---------------- title+body = ques

In [38]:
b_data.shape

(9805, 1)

In [39]:
t_data.shape

(9805, 1)

In [40]:
title_data = pd.DataFrame()
title_data['ques'] = t_data.Title + b_data.Body
title_data.shape

(9805, 1)

### --------------------------------------

In [140]:
from sklearn.feature_extraction.text import CountVectorizer

In [141]:
vectorizer_tag = CountVectorizer(tokenizer = lambda x: x.split(), binary='true',min_df = 3)  #Binary BoW

In [142]:
multi_lab_y = vectorizer_tag.fit_transform(tag_data.Tags)

In [143]:
print("Number of data points :", multi_lab_y.shape[0])
print("Number of unique tags :", multi_lab_y.shape[1])

('Number of data points :', 9805)
('Number of unique tags :', 1681)


In [144]:
#vectorizer_tag.get_feature_names()

In [145]:
## choose first n tags (desc order of count)

def tags_to_choose(n):
    t = multilabel_y.sum(axis=0).tolist()[0]
    sorted_tags_i = sorted(range(len(t)), key=lambda i: t[i], reverse=True)
    multilabel_yn = multilabel_y[:,sorted_tags_i[:n]]
    return multilabel_yn

<h2>4.2 Split the data into test and train (80:20) </h2>

In [146]:
tot_size = title_data.shape[0]
train_size = int(tot_size * 0.8)
print(tot_size,train_size)

(9805, 7844)


In [147]:
x_train=title_data.head(train_size)
x_test=title_data.tail(tot_size - train_size)

y_train = multi_lab_y[0:train_size,:]
y_test = multi_lab_y[train_size:tot_size,:]

## @ Model Testing

In [148]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import LinearSVC

from sklearn import metrics
from sklearn.metrics import f1_score,precision_score,recall_score

In [149]:
vectorizer = TfidfVectorizer(min_df=0.00009, max_features=200000, smooth_idf=True, norm="l2", \
                             tokenizer = lambda x: x.split(), sublinear_tf=False, ngram_range=(1,3))

In [151]:
start = datetime.now()
x_train_multilabel = vectorizer.fit_transform(x_train.Title)     ## replace column name appropiately
x_test_multilabel = vectorizer.transform(x_test.Title)
print("Time taken to run this cell :", datetime.now() - start)

('Time taken to run this cell :', datetime.timedelta(0, 0, 489734))


In [152]:
print("Dimensions of train data X:",x_train_multilabel.shape, "Y :",y_train.shape)
print("Dimensions of test data X:",x_test_multilabel.shape,"Y:",y_test.shape)

('Dimensions of train data X:', (7844, 70868), 'Y :', (7844, 1681))
('Dimensions of test data X:', (1961, 70868), 'Y:', (1961, 1681))


In [153]:
joblib.dump(x_test_multilabel,'/data/title_xtest_multilabel.pkl')

['/data/title_xtest_multilabel.pkl']

### SGDC with OnevsRest

In [114]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l1'), n_jobs=-1)
classifier.fit(x_train_multilabel, y_train)
predictions = classifier.predict(x_test_multilabel)

print("Time taken to run this cell :", datetime.now() - start)
print("accuracy :",metrics.accuracy_score(y_test,predictions))
#print("macro f1 score :",metrics.f1_score(y_test, predictions, average = 'macro'))
#print("micro f1 scoore :",metrics.f1_score(y_test, predictions, average = 'micro'))
#print("hamming loss :",metrics.hamming_loss(y_test,predictions))
#print("Precision recall report :\n",metrics.classification_report(y_test, predictions))


('Time taken to run this cell :', datetime.timedelta(0, 166, 325532))
('accuracy :', 0.079551249362570117)


In [115]:
print "micro f1 scoore :",metrics.f1_score(y_test, predictions, average = 'micro') 

micro f1 scoore : 0.310294356806


In [116]:
joblib.dump(classifier,'/data/body_sgdc.pkl')

['/data/body_sgdc.pkl']

### Logistic with OnevsRest

In [179]:
start = datetime.now()

classifier2 = OneVsRestClassifier(LogisticRegression(penalty='l1'), n_jobs=-1)
classifier2.fit(x_train_multilabel, y_train)
predictions2 = classifier2.predict(x_test_multilabel)

print("Time taken to run this cell :", datetime.now() - start)
print("accuracy :",metrics.accuracy_score(y_test,predictions2))

('Time taken to run this cell :', datetime.timedelta(0, 89, 9685))
('accuracy :', 0.043499999999999997)


## LDA  -- TIME

In [204]:
y_train3 =y_train.todense()
y_test3 = y_test.todense()

print type(y_train)
print type(x_train_multilabel.todense())

<class 'scipy.sparse.csr.csr_matrix'>
<class 'numpy.matrixlib.defmatrix.matrix'>


In [None]:
start = datetime.now()

#classifier3 = OneVsRestClassifier(LDA())
#classifier3.fit(x_train_multilabel.todense(), y_train)
#predictions3 = classifier3.predict(x_test_multilabel)

#print("Time taken to run this cell :", datetime.now() - start)
#print("accuracy :",metrics.accuracy_score(y_test,predictions3))

## SVC

In [210]:
start = datetime.now()

classifier4 = OneVsRestClassifier(LinearSVC())
classifier4.fit(x_train_multilabel, y_train)
predictions4 = classifier4.predict(x_test_multilabel)

print("Time taken to run this cell :", datetime.now() - start)
print("accuracy :",metrics.accuracy_score(y_test,predictions4))

('Time taken to run this cell :', datetime.timedelta(0, 69, 99169))
('accuracy :', 0.035999999999999997)


In [None]:
joblib.dump()

In [62]:
type(y_test)

scipy.sparse.csr.csr_matrix

In [178]:
actual = vectorizer_tag.inverse_transform(y_test)
actual = [' '.join(i) for i in actual]
actual = pd.Series(actual,name="Actual")
#actual

In [53]:
preds = vectorizer_tag.inverse_transform(predictions)
preds = [' '.join(i) for i in preds]
preds = pd.Series(preds,name="Pred_by_ques")
#preds

In [190]:
resul_df = pd.concat([actual,preds],axis=1)
#resul_df.to_csv("01_resul_rep.csv",index=None)

In [191]:
#rr = pd.read_csv('01_resul_rep.csv')

In [158]:
#rr.append?

In [213]:
rr['pred_by_ques']=preds

In [215]:
rr.to_csv("01_rep_mindf_03.csv",index=None)

In [59]:
r = pd.read_csv('01_rep_mindf_03.csv')

In [178]:
#r

## load and run

In [171]:
title_x_test_multilabel = joblib.load('/data/title_xtest_multilabel.pkl')

title_sgc = joblib.load('/data/title_sgdc.pkl')

pred_title = title_sgc.predict(title_x_test_multilabel)

print("From title accuracy :",metrics.accuracy_score(y_test,pred_title))
print("From title micro f1 scoore :",metrics.f1_score(y_test, pred_title, average = 'micro'))

('From title accuracy :', 0.092809790922998472)
('From title micro f1 scoore :', 0.37282442748091604)


In [172]:
body_x_test_multilabel = joblib.load('/data/body_xtest_multilabel.pkl')

body_sgc = joblib.load('/data/body_sgdc.pkl')

pred_body = body_sgc.predict(body_x_test_multilabel)

print("From body accuracy :",metrics.accuracy_score(y_test,pred_body))
print("From body micro f1 scoore :",metrics.f1_score(y_test, pred_body, average = 'micro'))

('From body accuracy :', 0.079551249362570117)
('From body micro f1 scoore :', 0.31029435680598472)


In [167]:
p_or = pred_title.todense() | pred_body.todense()

In [173]:
print("From OR accuracy :",metrics.accuracy_score(y_test,p_or))
print("From OR micro f1 scoore :",metrics.f1_score(y_test, p_or, average = 'micro'))

('From OR accuracy :', 0.10912799592044875)
('From OR micro f1 scoore :', 0.44913991602329678)


In [176]:
preds = vectorizer_tag.inverse_transform(p_or)
preds = [' '.join(i) for i in preds]
preds = pd.Series(preds,name="Pred_by_OR")
r['pred_by_OR'] = preds
r.to_csv("01_rep_mindf_03.csv",index=None)

In [177]:
r = pd.read_csv('01_rep_mindf_03.csv')
r

Unnamed: 0,Actual,Pred_by_Title,pred_by_body,pred_by_ques,pred_by_OR
0,response list web-services,web-services,web-services,web-services,web-services
1,json-encode ajax php,php,ajax php,ajax php,ajax php
2,upgrade windows-7,windows-7,windows windows-7,windows windows-7,windows windows-7
3,prime-numbers,,,,
4,django database python,django python,django,django,django python
5,newline c,,,,
6,navigation android,,android,android,android
7,android web-services,,android,android,android
8,conflict version-control merge git,git merge,git,git merge,git merge
9,cocos2d-iphone cocos2d objective-c iphone,cocos2d cocos2d-iphone ios iphone,,,cocos2d cocos2d-iphone ios iphone
