In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
from sqlalchemy import create_engine

import os
import csv

In [30]:
loc = '/data/SO_predict_DATA'
db_name = '/train_no_dup.db'

In [31]:
def create_connection(db_file):
    """ create a database connection to the SQLite database
        specified by db_file
    :param db_file: database file
    :return: Connection object or None
    """
    try:
        conn = sqlite3.connect(loc+db_file)
        return conn
    except sqlite3.Error as e:
        print(e)
 
    return None

In [32]:
con = create_connection(db_name)

In [33]:
if con is not None:
    tag_data = pd.read_sql('''SELECT Tags FROM no_dup_train''',con)
    print(tag_data.shape)

(4206308, 1)


In [34]:
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
vectorizer = CountVectorizer(tokenizer = lambda x: x.split())
tag_dtm = vectorizer.fit_transform(tag_data['Tags'])

In [36]:
print("Number of data points :", tag_dtm.shape[0])
print("Number of unique tags :", tag_dtm.shape[1])

('Number of data points :', 4206308)
('Number of unique tags :', 42048)


In [37]:
tag_names = vectorizer.get_feature_names()

In [38]:
freqs = tag_dtm.sum(axis=0).A1    ##.A1 Return `self` as a flattened `ndarray`.
result = dict(zip(tag_names, freqs))

In [40]:
#Saving this dictionary to csv files.
if not os.path.isfile('full_tag_counts.csv'):
    with open('full_tag_counts.csv', 'w') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Tags', 'Counts'])
        for key, value in result.items():
            writer.writerow([key, value])
            
tag_df = pd.read_csv("full_tag_counts.csv")
tag_df.head()

Unnamed: 0,Tags,Counts
0,mdbg,14
1,fouc,23
2,mdraid,4
3,screen-resolution,477
4,mms-streaming,10


In [41]:
tag_df_sorted = tag_df.sort_values(['Counts'], ascending=False)
tag_counts = tag_df_sorted['Counts'].values

In [42]:
tag_df_sorted.head(20)

Unnamed: 0,Tags,Counts
12780,c#,331505
30485,java,299414
12835,php,284103
22551,javascript,265423
23337,android,235436
4155,jquery,221533
3573,c++,143936
12555,python,134137
29491,iphone,128681
2216,asp.net,125651


In [43]:
print(tag_df_sorted['Counts'].describe())

count     42048.000000
mean        290.048231
std        4078.252789
min           1.000000
25%           5.000000
50%          17.000000
75%          66.000000
max      331505.000000
Name: Counts, dtype: float64


In [51]:
42048 - (len(tag_df_sorted[tag_df_sorted['Counts'] < 25]))

17702

In [59]:
(float)(len(tag_df_sorted[tag_df_sorted['Counts'] > 25]))/len(tag_df_sorted)

0.412718797564688

In [60]:
tag_to_remove = tag_df_sorted[tag_df_sorted['Counts'] < 25]

In [61]:
type(tag_to_remove)

pandas.core.frame.DataFrame

In [62]:
tag_to_remove.to_csv('full_tag_remain.csv',index=None)

In [64]:
tag_df_sorted[tag_df_sorted['Tags']=='pyscard']

Unnamed: 0,Tags,Counts
5546,pyscard,1
