In [1]:
import numpy as np
import pandas as pd
import json 
import imblearn

In [2]:
from sklearn.manifold import Isomap, TSNE, MDS, LocallyLinearEmbedding
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.cluster import KMeans

from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [3]:
from missingpy import MissForest



In [4]:
df = pd.read_json('features_label.json')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 232748 entries, 0 to 232747
Columns: 120 entries, 0 to class
dtypes: float64(118), int64(1), object(1)
memory usage: 214.9+ MB


In [5]:
df['class'] = df['class'].astype('category')
df.groupby('class').size()

class
Q1      59
Q2    9974
Q4    9885
dtype: int64

In [6]:
from sklearn import preprocessing
categorical = ['class']
le = preprocessing.LabelEncoder()
df[categorical] = df[categorical].apply(lambda series: pd.Series(
    le.fit_transform(series[series.notnull()]),
    index=series[series.notnull()].index
))

In [7]:
from collections import Counter
from imblearn.over_sampling import SMOTE

In [8]:
oversample = SMOTE()
X, y = oversample.fit_resample(df.loc[df['class'].notnull()].reset_index(), df.loc[df['class'].notnull(),'class'])
# summarize the new class distribution
counter = Counter(y)
print(counter)

Counter({0.0: 9974, 1.0: 9974, 2.0: 9974})


In [9]:
len(pd.unique(X['index']))

19934

In [10]:
X.groupby('class').size()

class
0.0    9974
1.0    9974
2.0    9974
dtype: int64

In [11]:
X.head()

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,110,111,112,113,114,115,116,117,118,class
0,154175,-0.289675,0.394769,0.256888,-0.317372,-0.237715,0.167305,-0.1987,0.171249,-0.033327,...,0.001391,0.840816,0.179573,0.485714,0.288557,1,1.0,3.951378,0.0,0.0
1,154177,-0.203194,0.273622,0.243855,-0.281782,0.095889,0.274552,-0.137389,0.273008,-0.054578,...,2.6e-05,0.69747,0.240055,0.495575,0.288664,1,1.0,3.457491,0.0,0.0
2,154179,-0.112905,0.204361,0.368729,-0.302603,-0.023144,0.15247,-0.110966,0.133138,-0.06807,...,-0.000165,0.652331,0.247898,0.494681,0.288659,1,1.0,3.258881,0.0,0.0
3,154180,-0.165507,0.221519,0.352529,-0.249288,-0.080767,0.211572,-0.242924,0.189266,-0.148492,...,-0.000314,0.691111,0.241398,0.491667,0.288635,2,0.983333,3.456722,0.016667,0.0
4,154186,-0.225182,0.188634,0.405607,-0.091972,-0.033343,0.101976,-0.136606,0.192652,-0.09423,...,-0.000344,0.713955,0.229247,0.495283,0.288662,3,0.981132,3.583734,0.018868,0.0


In [12]:
a = df.loc[df['class'].isnull()].reset_index()

In [13]:
overSampled = pd.concat([X,a]).set_index('index')

In [14]:
overSampled.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,110,111,112,113,114,115,116,117,118,class
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
154175,-0.289675,0.394769,0.256888,-0.317372,-0.237715,0.167305,-0.1987,0.171249,-0.033327,-0.001526,...,0.001391,0.840816,0.179573,0.485714,0.288557,1,1.0,3.951378,0.0,0.0
154177,-0.203194,0.273622,0.243855,-0.281782,0.095889,0.274552,-0.137389,0.273008,-0.054578,0.027165,...,2.6e-05,0.69747,0.240055,0.495575,0.288664,1,1.0,3.457491,0.0,0.0
154179,-0.112905,0.204361,0.368729,-0.302603,-0.023144,0.15247,-0.110966,0.133138,-0.06807,-0.022307,...,-0.000165,0.652331,0.247898,0.494681,0.288659,1,1.0,3.258881,0.0,0.0
154180,-0.165507,0.221519,0.352529,-0.249288,-0.080767,0.211572,-0.242924,0.189266,-0.148492,-0.037466,...,-0.000314,0.691111,0.241398,0.491667,0.288635,2,0.983333,3.456722,0.016667,0.0
154186,-0.225182,0.188634,0.405607,-0.091972,-0.033343,0.101976,-0.136606,0.192652,-0.09423,0.021252,...,-0.000344,0.713955,0.229247,0.495283,0.288662,3,0.981132,3.583734,0.018868,0.0


In [15]:
%%time

imputer = MissForest(random_state=42, class_weight='balanced')
imputed = imputer.fit_transform(overSampled, cat_vars=119)
imputed

Iteration: 0
Iteration: 1
Iteration: 2
CPU times: user 5min 9s, sys: 1.04 s, total: 5min 10s
Wall time: 43.2 s


array([[-0.28967483,  0.39476911,  0.25688809, ...,  3.95137796,
         0.        ,  0.        ],
       [-0.20319405,  0.27362184,  0.24385467, ...,  3.45749102,
         0.        ,  0.        ],
       [-0.11290469,  0.20436135,  0.36872937, ...,  3.25888057,
         0.        ,  0.        ],
       ...,
       [-0.11628602,  0.29573326,  0.40448253, ...,  4.0708538 ,
         0.        ,  1.        ],
       [-0.09307639,  0.14522664,  0.37296448, ...,  3.90355169,
         0.0060241 ,  1.        ],
       [-0.18077336,  0.1715945 ,  0.29286598, ...,  2.947734  ,
         0.00746269,  1.        ]])

In [16]:
cat_cols=['class']
imputed = pd.DataFrame(imputed, columns=df.columns.tolist())

In [17]:
imputed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,110,111,112,113,114,115,116,117,118,class
0,-0.289675,0.394769,0.256888,-0.317372,-0.237715,0.167305,-0.1987,0.171249,-0.033327,-0.001526,...,0.001391,0.840816,0.179573,0.485714,0.288557,1.0,1.0,3.951378,0.0,0.0
1,-0.203194,0.273622,0.243855,-0.281782,0.095889,0.274552,-0.137389,0.273008,-0.054578,0.027165,...,2.6e-05,0.69747,0.240055,0.495575,0.288664,1.0,1.0,3.457491,0.0,0.0
2,-0.112905,0.204361,0.368729,-0.302603,-0.023144,0.15247,-0.110966,0.133138,-0.06807,-0.022307,...,-0.000165,0.652331,0.247898,0.494681,0.288659,1.0,1.0,3.258881,0.0,0.0
3,-0.165507,0.221519,0.352529,-0.249288,-0.080767,0.211572,-0.242924,0.189266,-0.148492,-0.037466,...,-0.000314,0.691111,0.241398,0.491667,0.288635,2.0,0.983333,3.456722,0.016667,0.0
4,-0.225182,0.188634,0.405607,-0.091972,-0.033343,0.101976,-0.136606,0.192652,-0.09423,0.021252,...,-0.000344,0.713955,0.229247,0.495283,0.288662,3.0,0.981132,3.583734,0.018868,0.0


In [18]:
imputed.groupby('class').size()

class
0.0     10240
1.0    158450
2.0     74062
dtype: int64

In [19]:
metadata = pd.read_json('data_and_meta.json')
metadata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 232748 entries, 0 to 284530
Data columns (total 16 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   index           232748 non-null  int64 
 1   reviewerID      232748 non-null  object
 2   asin            232748 non-null  object
 3   reviewerName    230910 non-null  object
 4   helpful         232748 non-null  object
 5   reviewText      232748 non-null  object
 6   overall         232748 non-null  int64 
 7   summary         232748 non-null  object
 8   unixReviewTime  232748 non-null  int64 
 9   reviewTime      232748 non-null  object
 10  features        232748 non-null  object
 11  amazonCategory  232748 non-null  object
 12  class           19918 non-null   object
 13  title           106284 non-null  object
 14  description     232748 non-null  object
 15  feature         232748 non-null  object
dtypes: int64(3), object(13)
memory usage: 30.2+ MB


In [20]:
imputed['CLASS'] = le.inverse_transform(imputed['class'].astype(int)[imputed['class'].notnull()])
imputed.groupby('CLASS').size()

CLASS
Q1     10240
Q2    158450
Q4     74062
dtype: int64

In [21]:
imputed

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,111,112,113,114,115,116,117,118,class,CLASS
0,-0.289675,0.394769,0.256888,-0.317372,-0.237715,0.167305,-0.198700,0.171249,-0.033327,-0.001526,...,0.840816,0.179573,0.485714,0.288557,1.0,1.000000,3.951378,0.000000,0.0,Q1
1,-0.203194,0.273622,0.243855,-0.281782,0.095889,0.274552,-0.137389,0.273008,-0.054578,0.027165,...,0.697470,0.240055,0.495575,0.288664,1.0,1.000000,3.457491,0.000000,0.0,Q1
2,-0.112905,0.204361,0.368729,-0.302603,-0.023144,0.152470,-0.110966,0.133138,-0.068070,-0.022307,...,0.652331,0.247898,0.494681,0.288659,1.0,1.000000,3.258881,0.000000,0.0,Q1
3,-0.165507,0.221519,0.352529,-0.249288,-0.080767,0.211572,-0.242924,0.189266,-0.148492,-0.037466,...,0.691111,0.241398,0.491667,0.288635,2.0,0.983333,3.456722,0.016667,0.0,Q1
4,-0.225182,0.188634,0.405607,-0.091972,-0.033343,0.101976,-0.136606,0.192652,-0.094230,0.021252,...,0.713955,0.229247,0.495283,0.288662,3.0,0.981132,3.583734,0.018868,0.0,Q1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242747,-0.186514,0.099457,0.393527,-0.186841,0.037206,0.140150,-0.143451,0.161884,-0.102061,0.058485,...,0.647394,0.252082,0.499289,0.288675,2.0,0.998578,3.321696,0.001422,1.0,Q2
242748,-0.212278,0.177555,0.390303,-0.246410,-0.130181,0.186109,-0.030990,0.232672,-0.103300,0.034690,...,0.784241,0.210243,0.491228,0.288631,2.0,0.982456,3.925651,0.017544,1.0,Q2
242749,-0.116286,0.295733,0.404483,-0.161157,-0.175289,0.202572,-0.013252,0.300210,0.076071,-0.095648,...,0.836601,0.142492,0.490196,0.288620,1.0,1.000000,4.070854,0.000000,1.0,Q2
242750,-0.093076,0.145227,0.372964,-0.218381,-0.008757,0.194206,-0.045708,0.268212,-0.103843,0.044485,...,0.789229,0.211758,0.496988,0.288670,2.0,0.993976,3.903552,0.006024,1.0,Q2
