In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3
from sqlalchemy import create_engine

import os
import csv

In [2]:
# Prefix prepended to database file names when opening a connection.
# Empty means the current working directory; an alternative data root used
# previously was '/data/SO_predict_DATA'.
loc = ""

In [30]:
def create_connection(db_file):
    """Open a connection to the SQLite database ``db_file`` located under ``loc``.

    The path is built with ``os.path.join(loc, db_file)`` so that a directory
    prefix without a trailing separator (for example
    ``'/data/SO_predict_DATA'``) still resolves to a file inside that
    directory. An empty ``loc`` leaves ``db_file`` unchanged.

    :param db_file: database file name, resolved against the module-level ``loc``
    :return: ``sqlite3.Connection`` on success, or ``None`` when SQLite
        reports an error (the error is printed)
    """
    db_path = os.path.join(loc, db_file)
    try:
        # NOTE(review): sqlite3.connect creates an empty database when the
        # file does not exist, so a wrong path shows up later as a
        # "no such table" error rather than here.
        return sqlite3.connect(db_path)
    except sqlite3.Error as e:
        print(e)
        return None

In [31]:
# Open the training database; `con` is None if the connection attempt failed.
con = create_connection(db_file='random_train.db')

In [32]:
# Read only the Tags column of the `data` table. When no connection could be
# opened the query is skipped and `tag_data` is left undefined.
if con is not None:
    tag_data = pd.read_sql(sql='''SELECT Tags FROM data''', con=con)
    print(tag_data.shape)

(10000, 1)


In [33]:
from sklearn.feature_extraction.text import CountVectorizer

In [34]:
# Build the document-term matrix of tag occurrences. Each Tags string is
# tokenised by plain whitespace splitting via a custom tokenizer.
vectorizer = CountVectorizer(tokenizer=lambda tag_string: tag_string.split())
tag_dtm = vectorizer.fit_transform(tag_data.loc[:, 'Tags'])

In [35]:
# One row per data point, one column per distinct tag token.
n_points, n_tags = tag_dtm.shape
print("Number of data points :", n_points)
print("Number of unique tags :", n_tags)

('Number of data points :', 10000)
('Number of unique tags :', 6205)


In [36]:
# Column labels of tag_dtm, in column order.
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever this installation provides so the cell
# runs on both old and new releases. list() keeps tag_names a plain list.
if hasattr(vectorizer, 'get_feature_names_out'):
    tag_names = list(vectorizer.get_feature_names_out())
else:
    tag_names = vectorizer.get_feature_names()

In [37]:
# Summing over axis 0 yields one total per tag column; .A1 flattens the
# result into a 1-D ndarray.
freqs = tag_dtm.sum(axis=0).A1
# Pair every tag name with its total number of occurrences.
result = {tag: count for tag, count in zip(tag_names, freqs)}

In [38]:
# Cache the tag -> count dictionary as a CSV file. The file is written only
# when it does not exist yet, so delete '1_tag_counts.csv' to regenerate it
# after the underlying data changes.
if not os.path.isfile('1_tag_counts.csv'):
    with open('1_tag_counts.csv', 'w') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['Tags', 'Counts'])
        writer.writerows(result.items())

# keep_default_na=False: tag names such as "null", "nan" or "na" are real
# tags, but pandas would otherwise parse them as missing values (NaN).
tag_df = pd.read_csv("1_tag_counts.csv", keep_default_na=False)
tag_df.head()

Unnamed: 0,Tags,Counts
0,screen-resolution,2
1,userscripts,1
2,opengl,24
3,prefix,1
4,dropbox-api,2


In [39]:
# Order tags from most to least frequent and keep the counts as an array.
tag_df_sorted = tag_df.sort_values(by='Counts', ascending=False)
tag_counts = tag_df_sorted.loc[:, 'Counts'].values

In [40]:
# Show the five most frequent tags.
tag_df_sorted.head(n=5)

Unnamed: 0,Tags,Counts
1440,c#,745
5681,java,728
1469,php,653
1046,javascript,613
1462,android,567


In [41]:
# Summary statistics of the per-tag occurrence counts.
print(tag_df_sorted.loc[:, 'Counts'].describe())

count    6205.000000
mean        4.679291
std        24.583264
min         1.000000
25%         1.000000
50%         1.000000
75%         3.000000
max       745.000000
Name: Counts, dtype: float64


In [42]:
# Number of tags that occur fewer than 3 times.
len(tag_df_sorted.loc[tag_df_sorted['Counts'].lt(3)])

4524

In [43]:
# Rows for tags that occur fewer than 3 times; these are the removal candidates.
tag_to_remove = tag_df_sorted.loc[tag_df_sorted['Counts'].lt(3)]

In [44]:
# Sanity check: display the type of the filtered selection.
type(tag_to_remove)

pandas.core.frame.DataFrame

In [45]:
# Persist the rare tags without writing the DataFrame index as a column.
tag_to_remove.to_csv('tag_to_remove.csv', index=False)

In [50]:
# Look up the row for a single tag.
tag_df_sorted.loc[tag_df_sorted['Tags'].eq('web-services')]

Unnamed: 0,Tags,Counts
5650,web-services,58
