In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
from sqlalchemy import create_engine

import os
import csv

In [2]:
# Directory containing the SQLite database files (absolute, machine-specific path).
loc = '/data1/SO_predict_DATA'
# Database file name. The leading '/' matters: create_connection() builds the
# full path by plain concatenation (loc + db_file), not os.path.join.
# 'random_train.db' appears to be an alternative database file.
# NOTE(review): "data" / "no_dup_train" look like the table names of the two
# databases (no_dup_train is the table queried below) - confirm.
db_name = '/train_no_dup.db'

In [6]:
def create_connection(db_file):
    """Open the SQLite database located at ``loc + db_file``.

    The path is formed by concatenating the module-level ``loc`` with
    ``db_file`` verbatim, so ``db_file`` must carry its own leading separator.

    :param db_file: database file name, appended to ``loc``
    :return: sqlite3 Connection on success; None if sqlite3 raised an error
             (the error is printed, not re-raised)
    """
    connection = None
    try:
        connection = sqlite3.connect(loc + db_file)
    except sqlite3.Error as err:
        # Best-effort: report the problem and let the caller test for None.
        print(err)
    return connection

In [7]:
# Open the training database; `con` is None if sqlite3 reported an error.
con = create_connection(db_name)

In [8]:
# Load only the Tags column of the de-duplicated training table.
# Fail loudly when the connection could not be opened: every later cell needs
# `tag_data`, and silently skipping the query would only surface later as an
# unrelated NameError.
if con is None:
    raise RuntimeError("Could not open SQLite database: " + loc + db_name)
tag_data = pd.read_sql('''SELECT Tags FROM no_dup_train''', con)
print(tag_data.shape)

(4206308, 1)


In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Each Tags entry is a whitespace-separated list of tags, so splitting on
# whitespace (str.split with no arguments) yields one token per tag.
vectorizer = CountVectorizer(tokenizer=str.split)
# Sparse matrix with one row per Tags entry and one column per distinct token.
tag_dtm = vectorizer.fit_transform(tag_data['Tags'])

In [11]:
# tag_dtm has one row per entry of tag_data['Tags'] and one column per distinct tag.
n_points, n_tags = tag_dtm.shape
print("Number of data points :", n_points)
print("Number of unique tags :", n_tags)

Number of data points : 4206308
Number of unique tags : 42048


In [12]:
# Tag strings in column order of tag_dtm.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back on older installations.
# list(...) keeps the result a plain list, as the old method returned.
if hasattr(vectorizer, "get_feature_names_out"):
    tag_names = list(vectorizer.get_feature_names_out())
else:
    tag_names = vectorizer.get_feature_names()

In [13]:
# Column sums of the document-term matrix = total occurrences of each tag.
# .sum(axis=0) yields a 1 x n_tags matrix; .A1 flattens it to a 1-D ndarray.
freqs = tag_dtm.sum(axis=0).A1
# Map each tag name to its total count (tag_names and freqs share column order).
result = {name: count for name, count in zip(tag_names, freqs)}

In [15]:
# Cache the tag -> count mapping on disk. If tag_counts.csv already exists it is
# reused as-is; delete the file to force regeneration from `result`.
if not os.path.isfile('tag_counts.csv'):
    # newline='' is what the csv module requires (otherwise rows gain blank
    # lines on Windows); utf-8 matches the default encoding of pd.read_csv.
    with open('tag_counts.csv', 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Tags', 'Counts'])
        writer.writerows(result.items())

# keep_default_na=False: by default pandas parses literal strings such as
# "null", "nan" or "NA" as missing values, which would turn tags with those
# names into NaN.
tag_df = pd.read_csv("tag_counts.csv", keep_default_na=False)
tag_df.head()

Unnamed: 0,Tags,Counts
0,pywhois,7
1,envoy,2
2,cellid,45
3,surf,141
4,bandwidth-throttling,42


In [16]:
# Order tags from most to least frequent.
tag_df_sorted = tag_df.sort_values(by='Counts', ascending=False)
# Counts alone, as a NumPy array in the same descending order.
tag_counts = tag_df_sorted.Counts.values

In [17]:
# Display the 20 most frequent tags.
tag_df_sorted.head(20)

Unnamed: 0,Tags,Counts
23397,c#,331505
25725,java,299414
20987,php,284103
7143,javascript,265423
39024,android,235436
10971,jquery,221533
21125,c++,143936
30354,python,134137
9333,iphone,128681
7003,asp.net,125651


In [18]:
# Summary statistics of the per-tag counts (printed as plain text).
print(tag_df_sorted['Counts'].describe())

count     42048.000000
mean        290.048231
std        4078.252789
min           1.000000
25%           5.000000
50%          17.000000
75%          66.000000
max      331505.000000
Name: Counts, dtype: float64


In [19]:
# Number of tags that occur fewer than 25 times.
len(tag_df_sorted[tag_df_sorted['Counts'] < 25])


24346

In [20]:
# Fraction of distinct tags that occur more than 24 times (i.e. at least 25).
float(len(tag_df_sorted[tag_df_sorted['Counts'] > 24])) / len(tag_df_sorted)

0.4209950532724505

In [21]:
# Rows for tags occurring fewer than 25 times - the candidates for removal.
tag_to_remove = tag_df_sorted.loc[tag_df_sorted['Counts'] < 25]

In [22]:
# Sanity check: the boolean row filter yields a DataFrame.
type(tag_to_remove)

pandas.core.frame.DataFrame

In [23]:
# Persist the rare tags for later use; the row index is not written to the file.
tag_to_remove.to_csv('tag_to_remove.csv', index=False)

In [24]:
# Look up the row for the tag that is literally named 'tags'.
tag_df_sorted[tag_df_sorted['Tags']=='tags']

Unnamed: 0,Tags,Counts
21176,tags,3483


In [27]:
# Number of tags occurring exactly 24 times - the highest count that still
# falls under the `Counts < 25` removal filter.
len(tag_df_sorted[tag_df_sorted['Counts'] == 24])

361

In [152]:
# Inspect the type returned by CountVectorizer.fit_transform
# (a SciPy CSR sparse matrix according to the recorded output).
type(tag_dtm)

scipy.sparse.csr.csr_matrix