In [63]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import regex as re
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [3]:
df = pd.read_json("src/reviews_devset.json", lines=True)

In [6]:
df_selected = df[["reviewText", "category"]]

In [26]:
def preprocess(review):
    new_text = ""
    delimiters = ['(', ')', ']', '[', ']', '{', '}', '.', '!', '?', ',', ';',':','+','=','-','_','"','`','~','#','@','&','*','%','€','$','§','/','0','1','2','3','4','5','6','7','8','9']
    WORD_RE = re.compile(r'[\w]+')
    stops = set(i.strip() for i in open('./src/stopwords.txt'))              
    for word in WORD_RE.findall(review):
        if len(word) != 0 and word.lower() not in stops and any(x in word for x in delimiters) == False:
            new_text += " " + word.lower()
    return new_text

In [32]:
amazon = []
for inx, row in df_selected.iterrows():
    amazon.append(preprocess(row['reviewText']))

In [35]:
data_prep = pd.DataFrame(amazon, columns=["text"])

In [37]:
data_prep['category'] = df_selected.category

In [38]:
data_prep

Unnamed: 0,text,category
0,gift husband making things time love food dir...,Patio_Lawn_and_Garde
1,nice spreader feels solid pneumatic tires giv...,Patio_Lawn_and_Garde
2,metal base hose attachments poorly designed m...,Patio_Lawn_and_Garde
3,part works pretty good bought john deere work...,Patio_Lawn_and_Garde
4,hose supposed flexible hard heavy unwieldy ki...,Patio_Lawn_and_Garde
...,...,...
78824,wife good bought sooner love color big turn,Health_and_Personal_Care
78825,gave forskolin needed loosing pounds reach go...,Health_and_Personal_Care
78826,incorporated garcinia cambogia exercise food ...,Health_and_Personal_Care
78827,lip balms pina colada maui mike glides smooth...,Health_and_Personal_Care


In [172]:
tf = TfidfVectorizer()

vectorized = tf.fit_transform(data_prep.text)

In [173]:
vectorized

<78829x95638 sparse matrix of type '<class 'numpy.float64'>'
	with 2119083 stored elements in Compressed Sparse Row format>

In [174]:
test = SelectKBest(score_func=chi2, k=2000)

In [175]:
fit = test.fit(vectorized, data_prep.category)

In [188]:
vectorized.shape

(78829, 95638)

In [189]:
fit.pvalues_.shape

(95638,)

In [182]:
pvals = fit.get_feature_names_out(fit.pvalues_).reshape(-1,1)

In [183]:
tf.inverse_transform(pvals)

[array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array(['aaa'], dtype='<U112'),
 array([

In [None]:
def chi_square_test_new(predictions, actual, n_labels):
    
    degrees_of_freedom = n_labels-1
    chi_test_statistic = 0
    pred_distribution_dict = {}
    actual_distribution_dict= {}
    labels = list(range(n_labels))
    
    for label in labels:
        pred_distribution_dict[label] = 0
        actual_distribution_dict[label] = 0
    for label in labels:
        for pred in predictions:
            if pred == label:
                pred_distribution_dict[label]+=1
    for label in labels:
        for a in actual:
            if a == label:
                actual_distribution_dict[label]+=1
    
    new_actual = list(actual_distribution_dict.values())
    new_preds = list(pred_distribution_dict.values())
    for i in range(len(new_actual)):
        if new_actual[i] == 0:
            new_actual[i] = 1
            new_preds[i] = 1
    
    for i in range(len(new_preds)):
        chi_test_statistic += ((new_actual[i]-new_preds[i])**2)/(new_preds[i])
    
    answer_dict = {'degrees of freedom':degrees_of_freedom, 'chi square test statistic':chi_test_statistic}
    
    return answer_dict

In [98]:
df1

Unnamed: 0,text,category
0,gift husband making things time love food dir...,Patio_Lawn_and_Garde
1,nice spreader feels solid pneumatic tires giv...,Patio_Lawn_and_Garde
2,metal base hose attachments poorly designed m...,Patio_Lawn_and_Garde
3,part works pretty good bought john deere work...,Patio_Lawn_and_Garde
4,hose supposed flexible hard heavy unwieldy ki...,Patio_Lawn_and_Garde
...,...,...
989,original walls water plastic thin stand fille...,Patio_Lawn_and_Garde
990,fast shipping exact item size needed made eas...,Patio_Lawn_and_Garde
991,light produces run home glove box open protru...,Patio_Lawn_and_Garde
992,bought tie rain barrels bulkhead works great ...,Patio_Lawn_and_Garde


In [156]:
df1 = data_prep.head(300)
df1.shape

(300, 2)

In [162]:
df1

Unnamed: 0,text,category
0,gift husband making things time love food dir...,Patio_Lawn_and_Garde
1,nice spreader feels solid pneumatic tires giv...,Patio_Lawn_and_Garde
2,metal base hose attachments poorly designed m...,Patio_Lawn_and_Garde
3,part works pretty good bought john deere work...,Patio_Lawn_and_Garde
4,hose supposed flexible hard heavy unwieldy ki...,Patio_Lawn_and_Garde
...,...,...
295,door mine stay closed top expensive cheap qua...,Patio_Lawn_and_Garde
296,agree yard butler saves knees big,Patio_Lawn_and_Garde
297,arrive zillion pieces stand attach skinny wir...,Patio_Lawn_and_Garde
298,simplest trap set place tunnel step trap works,Patio_Lawn_and_Garde


In [157]:
tf = TfidfVectorizer(dtype=np.float64)

vectorized1 = tf.fit_transform(df1.text)

In [158]:
fit = test.fit(vectorized1, df1.category)

In [166]:
dfd = pd.DataFrame(vectorized1.toarray())
dfd

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2971,2972,2973,2974,2975,2976,2977,2978,2979,2980
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
296,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
297,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.249942,0.0,0.0,0.0,0.0,0.0
298,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [170]:
dfd.iloc[0][dfd.iloc[0] > 0]

149     0.197192
262     0.197192
317     0.197192
570     0.197192
572     0.197192
686     0.167148
766     0.094382
976     0.282590
1045    0.141295
1198    0.197192
1227    0.156115
1297    0.197192
1320    0.197192
1362    0.183897
1481    0.351329
1505    0.113090
1507    0.161170
1703    0.156115
1754    0.197192
1928    0.183897
1955    0.183897
1999    0.197192
2034    0.197192
2322    0.161170
2639    0.141295
2662    0.089125
2706    0.174464
2970    0.197192
Name: 0, dtype: float64

In [164]:
tf.inverse_transform(vectorized1.toarray())

[array(['barbecue', 'broadening', 'calls', 'cuisine', 'culture',
        'directions', 'easy', 'food', 'gift', 'horizons', 'husband',
        'insight', 'interpret', 'kinds', 'love', 'make', 'making', 'open',
        'page', 'produced', 'provided', 'raichlen', 'recipes', 'simple',
        'things', 'time', 'trail', 'yum'], dtype='<U15'),
 array(['arm', 'bumps', 'cable', 'control', 'crappy', 'distribution',
        'edgeguard', 'experimentation', 'farther', 'feels', 'flings',
        'give', 'good', 'great', 'handling', 'left', 'long',
        'maneuverability', 'material', 'metal', 'nice', 'pneumatic',
        'precise', 'products', 'settings', 'side', 'solid', 'spreader',
        'time', 'tires', 'true'], dtype='<U15'),
 array(['attachments', 'avoid', 'badly', 'base', 'bit', 'designed',
        'fairly', 'falls', 'fix', 'gilmour', 'heads', 'hose', 'junction',
        'leaks', 'made', 'metal', 'plastic', 'pointed', 'poorly', 'pops',
        'previous', 'reviewer', 'spike', 'sprinkler',

In [171]:
test = SelectKBest(score_func=chi2, k=2000)

In [None]:
fit = test.fit(vectorized, data_prep.category)