In [4]:
#import warnings
#warnings.filterwarnings("ignore")

import pandas as pd
import os
from sqlalchemy import create_engine
from datetime import datetime
import datetime as dt
import sqlite3

import csv

In [2]:
import nltk
# Fetch the NLTK stopwords corpus; it is loaded further down via nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ai16/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# Directory that holds Train.csv and the SQLite databases built from it.
# Set the SO_PREDICT_DATA_DIR environment variable to run the notebook against
# another location without editing this cell; the default is unchanged.
loc = os.environ.get('SO_PREDICT_DATA_DIR', '/data/SO_predict_DATA')

In [10]:
# Sanity check: full path of the SQLite database that the following cells test for and read.
loc+'/train.db'

'/data/SO_predict_DATA/train.db'

In [4]:
# Load Train.csv into the SQLite table `data`, chunk by chunk so the whole file
# never has to be held in memory. Skipped when the database already exists.
train_db_path = loc+'/train.db'
if not os.path.isfile(train_db_path):
    start = datetime.now()
    # Create the database at the very path that was just tested (and that every
    # later cell reads). A bare 'sqlite:///train.db' is relative to the current
    # working directory and only matches when the notebook is started inside `loc`.
    disk_engine = create_engine('sqlite:///' + train_db_path)
    chunksize = 180000
    j = 0
    index_start = 1
    # NOTE(review): `names=` without `header=0` makes pandas treat the first line
    # of the CSV as data; if Train.csv carries a header row it is stored as a
    # record - confirm against the file.
    for df in pd.read_csv(loc+'/Train.csv', names=['Id', 'Title', 'Body', 'Tags'], chunksize=chunksize, iterator=True, encoding='utf-8', ):
        df.index += index_start
        j+=1
        # Progress indicator: an upper bound, the last chunk may be shorter.
        print('{} rows'.format(j*chunksize))
        df.to_sql('data', disk_engine, if_exists='append')
        index_start = df.index[-1] + 1
    print("Time taken :", datetime.now() - start)
else:
    print("Already Exists")

Already Exists


In [9]:
# Count the rows stored in the `data` table of train.db.
if os.path.isfile(loc+'/train.db'):
    start = datetime.now()
    con = sqlite3.connect(loc+'/train.db')
    try:
        num_rows = pd.read_sql_query("""SELECT count(*) FROM data""", con)
    finally:
        # Release the database handle even when the query raises.
        con.close()
    print("Number of rows in the database :","\n",num_rows['count(*)'].values[0])
    print("Time taken to count the number of rows :", datetime.now() - start)
else:
    print("Error !! DB file not found")

Number of rows in the database : 
 6034196
Time taken to count the number of rows : 0:00:00.129280


In [8]:
# Collapse identical (Title, Body, Tags) rows into one and record in `cnt_dup`
# how many times each distinct question occurred.
if os.path.isfile(loc+'/train.db'):
    start = datetime.now()
    con = sqlite3.connect(loc+'/train.db')
    try:
        df_no_dup = pd.read_sql_query('SELECT Title, Body, Tags, COUNT(*) as cnt_dup FROM data GROUP BY Title, Body, Tags', con)
    finally:
        # Release the database handle even when the query raises.
        con.close()
    print("Time taken to run this cell :", datetime.now() - start)
else:
    print("Error !! DB file not found")

Time taken to run this cell : 0:01:22.584121


In [10]:
# Every row beyond the distinct (Title, Body, Tags) groups is a duplicate.
total_rows = num_rows['count(*)'].values[0]
unique_rows = df_no_dup.shape[0]
duplicate_pct = (1-((unique_rows)/(total_rows)))*100
print("number of duplicate questions :", total_rows - unique_rows, "(", duplicate_pct, "% )")

number of duplicate questions : 1827881 ( 30.292038906260256 % )


In [13]:
# Distribution of occurrence counts: how many distinct questions appear once, twice, ...
df_no_dup.cnt_dup.value_counts()

1    2656284
2    1272336
3     277575
4         90
5         25
6          5
Name: cnt_dup, dtype: int64

In [59]:
# Number of distinct questions whose Tags value is missing.
len(df_no_dup.cnt_dup[df_no_dup.Tags.isna()])

7

In [14]:
#Creating a new database with no duplicates
no_dup_db_path = loc+'/train_no_dup.db'
if not os.path.isfile(no_dup_db_path):
    # Write to the same path that is tested above and read by the later cells.
    # A bare 'sqlite:///train_no_dup.db' is relative to the current working
    # directory and only matches when the notebook is started inside `loc`.
    disk_dup = create_engine('sqlite:///' + no_dup_db_path)
    # Keep only the question columns; the cnt_dup helper column is not stored.
    no_dup = pd.DataFrame(df_no_dup, columns=['Title', 'Body', 'Tags'])
    no_dup.to_sql('no_dup_train',disk_dup)

In [143]:
# Read only the Tags column of the de-duplicated table rather than the whole
# table; the tag analysis below needs nothing else.
if os.path.isfile(loc+'/train_no_dup.db'):
    start = datetime.now()
    con = sqlite3.connect(loc+'/train_no_dup.db')
    try:
        tag_data = pd.read_sql_query("""SELECT Tags FROM no_dup_train""", con)
    finally:
        # Always close the database, even if the query fails.
        con.close()

    # Show the first 5 rows of the data frame.
    print(tag_data.head())
    print("Time taken :", datetime.now() - start)
else:
    print("Error !! DB file not found")

                                  Tags
0                                c++ c
1          c# silverlight data-binding
2  c# silverlight data-binding columns
3                             jsp jstl
4                            java jdbc
Time taken : 0:00:09.445131


In [144]:
### Entries with no Tags
# Count the rows whose Tags value is NULL.

len(tag_data[tag_data['Tags'].isnull()])

7

In [145]:
# (rows, columns) of the Tags frame read from no_dup_train.
tag_data.shape

(4206315, 1)

### Delete rows whose Tags value is NULL

In [114]:
# Close whatever sqlite3 connection is still bound to `con` before the table is
# modified below (closing an already-closed connection is a no-op).
con.close()

In [147]:
# Remove the questions that have no tags from the de-duplicated database.
# The engine must point at the file under `loc` that the surrounding cells read
# with sqlite3; a bare 'sqlite:///train_no_dup.db' is relative to the current
# working directory and would touch (or create) a different file.
disk_engine = create_engine('sqlite:///' + loc + '/train_no_dup.db')
disk_engine.execute("""DELETE FROM no_dup_train WHERE Tags IS NULL""")

<sqlalchemy.engine.result.ResultProxy at 0x7f0c19100cc0>

In [16]:
# Re-read the Tags column after the NULL-tag rows were deleted.
if os.path.isfile(loc+'/train_no_dup.db'):
    start = datetime.now()
    con = sqlite3.connect(loc+'/train_no_dup.db')
    try:
        tag_data = pd.read_sql_query("""SELECT Tags FROM no_dup_train""", con)
    finally:
        # Close exactly once, and also when the query fails.
        con.close()
    print("Time taken :", datetime.now() - start)
else:
    print("Error !! DB file not found")

('Time taken :', datetime.timedelta(0, 10, 509544))


In [17]:
# Confirm that no NULL-tag rows are left in the freshly read data.
len(tag_data[tag_data['Tags'].isnull()])

0

In [18]:
# (rows, columns) of the Tags frame after the cleanup.
tag_data.shape

(4206308, 1)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
def _split_tags(tag_string):
    """Split a space-separated tag string into its individual tags."""
    return tag_string.split()

# Tokenize on whitespace only, so every whitespace-separated tag is one token.
vectorizer = CountVectorizer(tokenizer=_split_tags)

# fit_transform() learns the tag vocabulary and, in the same pass, encodes each
# question's tag string as a row of counts over that vocabulary.
# It expects an iterable of strings, here the Tags column.
tag_dtm = vectorizer.fit_transform(tag_data['Tags'])

In [9]:
# One row per question, one column per distinct tag.
n_data_points, n_unique_tags = tag_dtm.shape
print("Number of data points :", n_data_points)
print("Number of unique tags :", n_unique_tags)

('Number of data points :', 4206308)
('Number of unique tags :', 42048)


In [10]:
# Check which matrix representation fit_transform() returned.
type(tag_dtm)

scipy.sparse.csr.csr_matrix

In [11]:
# The fitted vocabulary, one entry per column of tag_dtm.
# scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
# get_feature_names(), so use the new accessor when it exists and fall back to
# the old one otherwise. tolist() keeps `tags` a plain list of strings, as before.
if hasattr(vectorizer, 'get_feature_names_out'):
    tags = vectorizer.get_feature_names_out().tolist()
else:
    tags = vectorizer.get_feature_names()
#Lets look at the tags we have.
print("Some of the tags we have :", tags[:10])

('Some of the tags we have :', [u'.a', u'.app', u'.asp.net-mvc', u'.aspxauth', u'.bash-profile', u'.class-file', u'.cs-file', u'.doc', u'.drv', u'.ds-store'])


In [12]:
# https://stackoverflow.com/questions/15115765/how-to-access-sparse-matrix-elements
# Sum every column of the document-term matrix to get the total number of
# occurrences of each tag; `.A1` flattens the matrix returned by sum(axis=0)
# into a 1-D array. Then map tag -> count.
freqs = tag_dtm.sum(axis=0).A1
result = dict(zip(tags, freqs))

In [9]:
# Persist the tag -> count mapping once; later runs reuse the existing file.
tag_counts_path = 'tag_counts_dict_dtm.csv'
if not os.path.isfile(tag_counts_path):
    with open(tag_counts_path, 'w') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Tags', 'Counts'])
        # One (tag, count) row per dictionary entry.
        writer.writerows(result.items())

# Load the counts back as a DataFrame for the analysis that follows.
tag_df = pd.read_csv("tag_counts_dict_dtm.csv")
tag_df.head()

Unnamed: 0,Tags,Counts
0,mdbg,14
1,fouc,23
2,mdraid,4
3,screen-resolution,477
4,mms-streaming,10


## Cleaning and preprocessing

In [7]:
#http://www.sqlitetutorial.net/sqlite-python/create-tables/
def create_connection(db_file):
    """ create a database connection to a SQLite database inside the data
        directory `loc`
    :param db_file: database file name, resolved relative to the global `loc`;
                    a leading '/' is accepted and ignored, so 'Processed.db'
                    and '/Processed.db' name the same file
    :return: Connection object or None
    """
    # Join the parts instead of concatenating them: plain `loc+db_file` turns a
    # name without a leading slash into a mangled sibling of the data directory.
    db_path = os.path.join(loc, db_file.lstrip('/'))
    try:
        return sqlite3.connect(db_path)
    except sqlite3.Error as er:
        # Report the failure and fall through to the None return.
        print(er)

    return None

def create_table(conn, create_table_sql):
    """ create a table from the create_table_sql statement
    :param conn: Connection object
    :param create_table_sql: a CREATE TABLE statement
    :return: None; a sqlite3 error is printed instead of being raised
    """
    try:
        c = conn.cursor()
        c.execute(create_table_sql)
    except sqlite3.Error as e:
        # Catch the module-qualified exception class; a bare `Error` is not
        # defined in this notebook and would raise NameError instead.
        print(e)
        
def checkTableExists(dbcon):
    """ print the names of the tables in the database and count them
    :param dbcon: open sqlite3 Connection object
    :return: number of tables found (0 for an empty database)
    """
    cursr = dbcon.cursor()
    query = "select name from sqlite_master where type='table'"
    tables = cursr.execute(query).fetchall()
    print("Tables in the databse:")
    # Each result row is a 1-tuple holding the table name. Listing every row
    # also copes with an empty database, where indexing tables[0] would fail.
    for (table_name,) in tables:
        print(table_name)
    return len(tables)

def create_database_table(database, query):
    """ open the database, run a CREATE TABLE statement and list its tables
    :param database: database file name, passed to create_connection()
    :param query: a CREATE TABLE statement
    :return:
    """
    conn = create_connection(database)
    if conn is None:
        # Nothing to close here: calling close() on None would raise AttributeError.
        print("Error! cannot create the database connection.")
        return
    try:
        create_table(conn, query)
        checkTableExists(conn)
    finally:
        # Release the handle even if one of the steps above raises.
        conn.close()

# Schema of the table that will hold the preprocessed questions.
sql_create_table = """CREATE TABLE IF NOT EXISTS QuestionsProcessed (question text NOT NULL, code text, tags text, words_pre integer, words_post integer, is_code integer);"""
# create_connection() builds the path from `loc` plus this name, so use the same
# leading-'/' form as '/train_no_dup.db' to place the file inside the data directory.
create_database_table("/Processed.db", sql_create_table)

Tables in the databse:
QuestionsProcessed


In [8]:
# Open the de-duplicated database and list the tables it contains.
no_dup_con = create_connection('/train_no_dup.db')
checkTableExists(no_dup_con)

Tables in the databse:
no_dup_train


1

In [14]:
# Pull a few sample rows to inspect the raw format
# (LIMIT 1,3 = skip the first row, return the next three).
if no_dup_con  is not None:
    temp_d = pd.read_sql_query("""SELECT * FROM no_dup_train LIMIT 1,3""", no_dup_con)
    print(type(temp_d))

<class 'pandas.core.frame.DataFrame'>


In [17]:
# Columns available in the sampled rows.
temp_d.columns

Index([u'index', u'Title', u'Body', u'Tags'], dtype='object')

In [21]:
# Raw body of the first sampled question, shown as its string repr.
t_bo = temp_d.iloc[0]['Body']
t_bo

u"<p>I should do binding for datagrid dynamically at code. I wrote the code as below. When I debug this code block, it seems that it does bindings correctly, but grid comes with no columns on form.</p>\n\n<pre><code>MyClass myInstance = new MyClass();\ndataGridObject = new DataGrid();\ndataGridObject.Width = 200;\ndataGridObject.Height = 200;\nbinding = new Binding();\nbinding.Source = myInstance;\nforeach (PropertyInfo prop in myInstance.GetType().GetProperties())\n{\n    binding.Path = new PropertyPath(prop.Name);\n    DataGridTextColumn column = new DataGridTextColumn();\n    column.Header = prop.Name;\n    column.Binding = new Binding(prop.Name);\n    dataGridObject.Columns.Add(column);\n}\n\ndataGridObject.ItemSource = myInstanceList;\n</code></pre>\n\n<p>Why doesn't come grid with columns, although I did necessary bindings?\nThanks for the replies in advance..</p>\n"

In [22]:
# The same body printed with real line breaks, which is easier to read.
print(t_bo)

<p>I should do binding for datagrid dynamically at code. I wrote the code as below. When I debug this code block, it seems that it does bindings correctly, but grid comes with no columns on form.</p>

<pre><code>MyClass myInstance = new MyClass();
dataGridObject = new DataGrid();
dataGridObject.Width = 200;
dataGridObject.Height = 200;
binding = new Binding();
binding.Source = myInstance;
foreach (PropertyInfo prop in myInstance.GetType().GetProperties())
{
    binding.Path = new PropertyPath(prop.Name);
    DataGridTextColumn column = new DataGridTextColumn();
    column.Header = prop.Name;
    column.Binding = new Binding(prop.Name);
    dataGridObject.Columns.Add(column);
}

dataGridObject.ItemSource = myInstanceList;
</code></pre>

<p>Why doesn't come grid with columns, although I did necessary bindings?
Thanks for the replies in advance..</p>



In [25]:
import re

def striphtml(data):
    """Replace every HTML tag in `data` with a single space.

    `data` is converted with str() first, so non-string input is accepted.
    A tag is any shortest run from '<' to '>' on one line ('.' does not match
    a newline), so a '<...>' span broken across lines is left untouched.
    """
    tag_pattern = '<.*?>'
    return re.sub(tag_pattern, ' ', str(data))

In [28]:
# Try the tag stripper on the sample body.
c = striphtml(t_bo)
c

" I should do binding for datagrid dynamically at code. I wrote the code as below. When I debug this code block, it seems that it does bindings correctly, but grid comes with no columns on form. \n\n  MyClass myInstance = new MyClass();\ndataGridObject = new DataGrid();\ndataGridObject.Width = 200;\ndataGridObject.Height = 200;\nbinding = new Binding();\nbinding.Source = myInstance;\nforeach (PropertyInfo prop in myInstance.GetType().GetProperties())\n{\n    binding.Path = new PropertyPath(prop.Name);\n    DataGridTextColumn column = new DataGridTextColumn();\n    column.Header = prop.Name;\n    column.Binding = new Binding(prop.Name);\n    dataGridObject.Columns.Add(column);\n}\n\ndataGridObject.ItemSource = myInstanceList;\n  \n\n Why doesn't come grid with columns, although I did necessary bindings?\nThanks for the replies in advance.. \n"

In [26]:
from nltk.corpus import stopwords

In [27]:
# Inspect NLTK's English stopword list.
stopwords.words('english')

[u'i',
 u'me',
 u'my',
 u'myself',
 u'we',
 u'our',
 u'ours',
 u'ourselves',
 u'you',
 u"you're",
 u"you've",
 u"you'll",
 u"you'd",
 u'your',
 u'yours',
 u'yourself',
 u'yourselves',
 u'he',
 u'him',
 u'his',
 u'himself',
 u'she',
 u"she's",
 u'her',
 u'hers',
 u'herself',
 u'it',
 u"it's",
 u'its',
 u'itself',
 u'they',
 u'them',
 u'their',
 u'theirs',
 u'themselves',
 u'what',
 u'which',
 u'who',
 u'whom',
 u'this',
 u'that',
 u"that'll",
 u'these',
 u'those',
 u'am',
 u'is',
 u'are',
 u'was',
 u'were',
 u'be',
 u'been',
 u'being',
 u'have',
 u'has',
 u'had',
 u'having',
 u'do',
 u'does',
 u'did',
 u'doing',
 u'a',
 u'an',
 u'the',
 u'and',
 u'but',
 u'if',
 u'or',
 u'because',
 u'as',
 u'until',
 u'while',
 u'of',
 u'at',
 u'by',
 u'for',
 u'with',
 u'about',
 u'against',
 u'between',
 u'into',
 u'through',
 u'during',
 u'before',
 u'after',
 u'above',
 u'below',
 u'to',
 u'from',
 u'up',
 u'down',
 u'in',
 u'out',
 u'on',
 u'off',
 u'over',
 u'under',
 u'again',
 u'further',
 u'th

In [28]:
from nltk.stem.snowball import SnowballStemmer

In [29]:
# English Snowball stemmer plus the English stopwords as a set
# (presumably for fast membership tests during preprocessing).
stemmer = SnowballStemmer("english")
stop_words = set(stopwords.words('english'))

In [37]:
# Open a cursor over a random sample of questions to preprocess, and empty the
# output table so that a re-run starts from a clean slate.
start = datetime.now()
# create_connection() builds its path from `loc` plus the name it is given, so
# use the same '/name.db' form as the other cells and test for the file at the
# location that will actually be opened. Checking a bare 'train_no_dup.db'
# looks in the current working directory instead.
read_db = '/train_no_dup.db'
write_db = '/Processed.db'
if os.path.isfile(loc + read_db):
    conn_r = create_connection(read_db)
    if conn_r is not None:
        reader =conn_r.cursor()
        # NOTE: RANDOM() is unseeded, so every run draws a different sample.
        reader.execute("SELECT Title, Body, Tags From no_dup_train ORDER BY RANDOM() LIMIT 1000000;")
else:
    # Say so now rather than failing later on an undefined `reader`.
    print("Error !! DB file not found")

if os.path.isfile(loc + write_db):
    conn_w = create_connection(write_db)
    if conn_w is not None:
        tables = checkTableExists(conn_w)
        writer =conn_w.cursor()
        if tables != 0:
            writer.execute("DELETE FROM QuestionsProcessed WHERE 1")
            # Persist the delete; without a commit it is rolled back when the
            # connection is closed.
            conn_w.commit()
            print("Cleared All the rows")
else:
    print("Error !! DB file not found")
print("Time taken to run this cell :", datetime.now() - start)

{1, 2, 3}

In [34]:
# Spot-check the stemmer on a single token.
stemmer.stem('w')

u'wive'