In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
from sqlalchemy import create_engine

import os
import csv
from datetime import datetime

In [2]:
loc = ''
db_name = 'random_train.db'

In [3]:
def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by db_file
    :param db_file: database file
    :return: Connection object or None
    """
    try:
        conn = sqlite3.connect(loc+db_file)
        return conn
    except sqlite3.Error as e:
        print(e)
 
    return None

In [4]:
con = create_connection(db_name)

In [5]:
if con is not None:
    tag_data = pd.read_sql('SELECT Tags FROM data',con)
else :
    print('Conn error')

In [6]:
print(tag_data.head())
print('Shape',tag_data.shape)

                                                Tags
0             iphone objective-c ios uiview uibutton
1                    svg internet-explorer-9 raphael
2  validation spring-mvc internationalization cus...
3                             windows java copy text
4                                  javascript jquery
Shape (10000, 1)


In [7]:
if con is not None:
    t_data = pd.read_sql('SELECT Title FROM data',con)
else :
    print('Conn error')


In [8]:
print(t_data.head())
print('Shape',t_data.shape)

                                               Title
0       One tap triggering events on multiple views?
1  IE9 text positioning bug when zoomed in with R...
2  Spring MVC custom errors and internationalization
3                How to copy text from Java program?
4  How to scroll to a part of the page using jQuery?
Shape (10000, 1)


## Preprocess title Data

In [9]:
from nltk.tokenize import word_tokenize  
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
# https://stackoverflow.com/questions/35345761/python-re-split-vs-nltk-word-tokenize-and-sent-tokenize

In [10]:
import re

#t_data.head()
t_data.Title = t_data.Title.apply(lambda x : x.encode('utf-8'))
t_data.Title = t_data.Title.apply(lambda x : str.lower(x))
t_data.Title = t_data.Title.apply(lambda x : re.sub(r'[^A-Za-z0-9#+.\-]+',' ',x))
#title_data = t_data


TypeError: descriptor 'lower' requires a 'str' object but received a 'bytes'

In [11]:
def fn(sen):
    return ' '.join(w for w in word_tokenize(sen) if w not in stop_words)

t_data.Title = t_data.Title.apply(lambda x : fn(x))#' '.join for w in word_tokenize(x) if w not in stop_words)


TypeError: cannot use a string pattern on a bytes-like object

In [12]:
t_data.Title[:5]

0      b'One tap triggering events on multiple views?'
1    b'IE9 text positioning bug when zoomed in with...
2    b'Spring MVC custom errors and internationaliz...
3               b'How to copy text from Java program?'
4    b'How to scroll to a part of the page using jQ...
Name: Title, dtype: object

In [13]:
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer


In [14]:
stemmer = SnowballStemmer("english")
stemmer.stem('wives')

'wive'

In [15]:
title_data = t_data

### ----- Body

In [16]:
if con is not None:
    b_data = pd.read_sql('SELECT Body FROM data',con)
else :
    print('Conn error')

In [17]:
print(b_data[:5])
print('Shape',b_data.shape)

                                                Body
0  <p>In my program I have placed a <code>UIButto...
1  <p>I'm not sure if this is a bug with Raphael ...
2  <p>In my web application, I handle errors with...
3  <p>I am using a Java program called <a href="h...
4  <p>My code scrolls the user to the bottom of t...
Shape (10000, 1)


In [18]:
b_data.Body = b_data.Body.apply(lambda x : re.sub(r'<code>(.*?)</code>',' ',x))
b_data.Body = b_data.Body.apply(lambda x : re.sub(r'<.*?>',' ',x))
b_data.Body = b_data.Body.apply(lambda x : re.sub(r'[^A-Za-z]+',' ',x))
b_data.Body = b_data.Body.apply(lambda x : x.encode('utf-8'))
b_data.Body = b_data.Body.apply(lambda x : str.lower(str(x)))

b_data.Body = b_data.Body.apply(lambda x : fn(x))

In [19]:
b_data[:5]

Unnamed: 0,Body
0,b ' program placed subview set button tapped e...
1,b ' sure bug raphael svg anyone fix workaround...
2,b ' web application handle errors annotations ...
3,b ' using java program called jdownloader v wi...
4,b ' code scrolls user bottom page var elem bod...


In [20]:
title_data = b_data

### --------------------------------------

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [22]:
vectorizer = CountVectorizer(tokenizer = lambda x: x.split(), binary='true')  #Binary BoW

In [23]:
multi_lab_y = vectorizer.fit_transform(tag_data.Tags)

In [24]:
print("Number of data points :", multi_lab_y.shape[0])
print("Number of unique tags :", multi_lab_y.shape[1])

Number of data points : 10000
Number of unique tags : 6205


In [25]:
## choose first n tags (desc order of count)

def tags_to_choose(n):
    t = multilabel_y.sum(axis=0).tolist()[0]
    sorted_tags_i = sorted(range(len(t)), key=lambda i: t[i], reverse=True)
    multilabel_yn = multilabel_y[:,sorted_tags_i[:n]]
    return multilabel_yn

<h2>4.2 Split the data into test and train (80:20) </h2>

In [26]:
tot_size = title_data.shape[0]
train_size = int(tot_size * 0.8)
print(tot_size,train_size)

10000 8000


In [27]:
x_train=title_data.head(train_size)
x_test=title_data.tail(tot_size - train_size)

y_train = multi_lab_y[0:train_size,:]
y_test = multi_lab_y[train_size:tot_size,:]

## @ Model Testing

In [28]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.svm import LinearSVC

from sklearn import metrics
from sklearn.metrics import f1_score,precision_score,recall_score

In [29]:
vectorizer = TfidfVectorizer(min_df=0.00009, max_features=200000, smooth_idf=True, norm="l2", \
                             tokenizer = lambda x: x.split(), sublinear_tf=False, ngram_range=(1,3))

In [30]:
start = datetime.now()
x_train_multilabel = vectorizer.fit_transform(x_train.Body)     ## replace column name appropiately
x_test_multilabel = vectorizer.transform(x_test.Body)
print("Time taken to run this cell :", datetime.now() - start)

Time taken to run this cell : 0:00:06.814395


In [31]:
print("Dimensions of train data X:",x_train_multilabel.shape, "Y :",y_train.shape)
print("Dimensions of test data X:",x_test_multilabel.shape,"Y:",y_test.shape)

Dimensions of train data X: (8000, 200000) Y : (8000, 6205)
Dimensions of test data X: (2000, 200000) Y: (2000, 6205)


### SGDC with OnevsRest

In [32]:
start = datetime.now()

classifier = OneVsRestClassifier(SGDClassifier(loss='log', alpha=0.00001, penalty='l1'), n_jobs=-1)
classifier.fit(x_train_multilabel, y_train)
predictions = classifier.predict(x_test_multilabel)

print("Time taken to run this cell :", datetime.now() - start)
print("accuracy :",metrics.accuracy_score(y_test,predictions))
#print("macro f1 score :",metrics.f1_score(y_test, predictions, average = 'macro'))
#print("micro f1 scoore :",metrics.f1_score(y_test, predictions, average = 'micro'))
#print("hamming loss :",metrics.hamming_loss(y_test,predictions))
#print("Precision recall report :\n",metrics.classification_report(y_test, predictions))


KeyboardInterrupt: 

### Logistic with OnevsRest

In [None]:
start = datetime.now()

classifier2 = OneVsRestClassifier(LogisticRegression(penalty='l1'), n_jobs=-1)
classifier2.fit(x_train_multilabel, y_train)
predictions2 = classifier2.predict(x_test_multilabel)

print("Time taken to run this cell :", datetime.now() - start)
print("accuracy :",metrics.accuracy_score(y_test,predictions2))

## LDA  -- TIME

In [34]:
y_train3 =y_train.todense()
y_test3 = y_test.todense()

print type(y_train)
print type(x_train_multilabel.todense())

SyntaxError: invalid syntax (<ipython-input-34-6b6343671de5>, line 4)

In [None]:
start = datetime.now()

classifier3 = OneVsRestClassifier(LDA(), n_jobs=-1)
classifier3.fit(x_train_multilabel.todense(), y_train)
predictions3 = classifier3.predict(x_test_multilabel)

print("Time taken to run this cell :", datetime.now() - start)
print("accuracy :",metrics.accuracy_score(y_test,predictions3))

## SVC

In [33]:
start = datetime.now()

classifier4 = OneVsRestClassifier(LinearSVC(), n_jobs=-1)
classifier4.fit(x_train_multilabel, y_train)
predictions4 = classifier4.predict(x_test_multilabel)

print("Time taken to run this cell :", datetime.now() - start)
print("accuracy :",metrics.accuracy_score(y_test,predictions4))

Time taken to run this cell : 0:00:47.556870
accuracy : 0.0165
