# Setup

## Import Libraries

In [1]:
import pandas as pd
import sqlite3
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

## Convert db file to pandas DataFrame

In [2]:
# Location of the scraped-messages SQLite file, relative to the working directory.
# NOTE: sqlite3.connect() creates a new empty database if this file is missing.
DB_PATH = './discord scraper/Discord Scrapes/text.db'
con = sqlite3.connect(DB_PATH)

In [3]:
# Look up the name of the table to load from SQLite's schema catalog.
# Internal bookkeeping tables (names starting with "sqlite_", e.g.
# sqlite_sequence) are skipped so that only user tables are considered.
cursor = con.cursor()
cursor.execute(
    "SELECT name FROM sqlite_master "
    "WHERE type='table' AND name NOT LIKE 'sqlite_%';"
)
first_row = cursor.fetchone()
if first_row is None:
    # Indexing an empty result would raise a bare IndexError; explain instead.
    raise ValueError(
        "No tables found in the SQLite database; "
        "check the path passed to sqlite3.connect()."
    )
table_name = first_row[0]

In [4]:
# Load the three columns used downstream into a DataFrame.
# The table name is quoted as an SQL identifier (embedded double quotes are
# doubled) so names containing spaces, punctuation, or keywords still produce
# a valid query.
quoted_table = '"' + table_name.replace('"', '""') + '"'
db = pd.read_sql_query(f"SELECT name, content, timestamp FROM {quoted_table}", con)

In [5]:
# Display the loaded messages (columns: name, content, timestamp).
db

Unnamed: 0,name,content,timestamp
0,!Thiccc Daddy Stqlker 223#6576,reinstall it,2020-10-03T00:30:20.077000+00:00
1,!Thiccc Daddy Stqlker 223#6576,<@!577345786778746911> if it was just that cl...,2020-10-03T00:38:33.786000+00:00
2,M1STERP1CKLES#1609,https://www.reddit.com/r/VALORANT/comments/j08...,2020-10-03T01:39:37.662000+00:00
3,tissueforissue#9056,if i link my amazon prfime account to my riot ...,2020-10-03T11:29:45.012000+00:00
4,!Thiccc Daddy Stqlker 223#6576,no u get the gun buddies and sprays,2020-10-03T15:27:05.849000+00:00
...,...,...,...
2066,NeutralEvilGirl#4822,I didn't really get an answer in another chann...,2020-05-02T16:23:13.457000+00:00
2067,Henriques#8411,sorry 游땵,2020-05-02T02:37:11.762000+00:00
2068,Table#9151,Kinda weird bro,2020-05-02T02:37:27.694000+00:00
2069,Henriques#8411,"sorry guys, i'm a begginer, and i'm not good a...",2020-05-02T02:38:20.310000+00:00


# Data Preprocessing

## Bag of Words

In [6]:
# Bag-of-words features: one row per message in db['content'], one column per
# vocabulary term, each cell holding the raw term count. NLTK's English
# stop-word list is excluded from the vocabulary.
count_vectorizer = CountVectorizer(analyzer='word', stop_words=stopwords.words('english'))
bag_counts = count_vectorizer.fit_transform(db['content'])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available from scikit-learn 1.0).
# The sparse matrix is converted to a dense array to build the DataFrame.
X_bag = pd.DataFrame(bag_counts.toarray(),
                     columns=count_vectorizer.get_feature_names_out())

In [7]:
# (rows, columns) of the bag-of-words matrix: messages x vocabulary terms.
X_bag.shape

(2071, 3577)

In [8]:
# Number of columns, i.e. the vocabulary size; equals X_bag.shape[1].
len(X_bag.columns)

3577

In [38]:
# Display the term-count matrix (one column per vocabulary term).
X_bag

Unnamed: 0,00,000,01,02,022dcb46bceba00cc953aae4eaa13df7,03,04,06,07,09,...,zipline,zowie,zvfyr4n,zxvytel7ow4,풤풦풧픠픥풧,풤풯,풬풤풯풤,픢픣풧풫풭풧,픥풧픡,構억쎾構억쌿
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2066,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2067,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2068,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2069,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Tf-idf

In [21]:
# TF-IDF features: one row per message in db['content'], one column per
# vocabulary term, each cell holding the term's TF-IDF weight. NLTK's English
# stop-word list is excluded from the vocabulary.
tfidf_vectorizer = TfidfVectorizer(analyzer='word', stop_words=stopwords.words('english'))
tfidf_weights = tfidf_vectorizer.fit_transform(db['content'])
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement (available from scikit-learn 1.0).
# The sparse matrix is converted to a dense array to build the DataFrame.
X_tfidf = pd.DataFrame(tfidf_weights.toarray(),
                       columns=tfidf_vectorizer.get_feature_names_out())

In [22]:
# (rows, columns) of the TF-IDF matrix: messages x vocabulary terms.
X_tfidf.shape

(2071, 3577)

In [23]:
# Preview the first five rows of the TF-IDF matrix.
X_tfidf.head()

Unnamed: 0,00,000,01,02,022dcb46bceba00cc953aae4eaa13df7,03,04,06,07,09,...,zipline,zowie,zvfyr4n,zxvytel7ow4,풤풦풧픠픥풧,풤풯,풬풤풯풤,픢픣풧풫풭풧,픥풧픡,構억쎾構억쌿
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
