In [10]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from scipy.stats import binomtest,chisquare,ttest_1samp,chi2_contingency,ttest_ind,f_oneway,pearsonr
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder,LabelEncoder,MinMaxScaler,StandardScaler,PolynomialFeatures
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.feature_selection import SelectKBest,f_classif,f_regression
from sklearn.neighbors import LocalOutlierFactor
from sklearn.model_selection import train_test_split


In [11]:
df = sns.load_dataset('diamonds')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [12]:
train_set, test_set = train_test_split(df,test_size=0.2,random_state=0)

In [13]:
print("train_set:",train_set.shape)
print("test_set:",test_set.shape)


train_set: (43152, 10)
test_set: (10788, 10)


# Encoding

In [14]:
df['color'].unique()

['E', 'I', 'J', 'H', 'F', 'G', 'D']
Categories (7, object): ['D', 'E', 'F', 'G', 'H', 'I', 'J']

In [16]:
cut_oder = ['Ideal', 'Premium', 'Good', 'Very Good', 'Fair']
clarity_order = ['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF']
color_oder = ['E', 'I', 'J', 'H', 'F', 'G', 'D']

encoder = OrdinalEncoder(
    categories=[cut_oder,color_oder,clarity_order]
)
encoding_results = encoder.fit_transform(train_set[['cut','color','clarity']])
encoding_results

array([[0., 5., 2.],
       [0., 5., 3.],
       [0., 0., 4.],
       ...,
       [1., 1., 2.],
       [0., 5., 7.],
       [0., 4., 0.]], shape=(43152, 3))

In [19]:
train_set.loc[:,['cut','color','clarity']]=encoding_results

  train_set.loc[:,['cut','color','clarity']]=encoding_results
  train_set.loc[:,['cut','color','clarity']]=encoding_results
  train_set.loc[:,['cut','color','clarity']]=encoding_results


In [20]:
train_set

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
26250,1.63,0.0,5.0,2.0,61.7,55.0,15697,7.56,7.60,4.68
31510,0.34,0.0,5.0,3.0,62.2,57.0,765,4.47,4.44,2.77
40698,0.40,0.0,0.0,4.0,61.7,56.0,1158,4.73,4.77,2.93
42634,0.58,1.0,3.0,1.0,62.1,55.0,1332,5.38,5.35,3.33
47714,0.63,3.0,6.0,1.0,62.8,57.0,1885,5.40,5.46,3.41
...,...,...,...,...,...,...,...,...,...,...
45891,0.52,1.0,4.0,3.0,60.7,59.0,1720,5.18,5.14,3.13
52416,0.70,2.0,6.0,1.0,63.6,60.0,2512,5.59,5.51,3.51
42613,0.32,1.0,1.0,2.0,61.3,58.0,505,4.35,4.39,2.68
43567,0.41,0.0,5.0,7.0,61.0,57.0,1431,4.81,4.79,2.93


# Normalisation

In [21]:
normalisation = MinMaxScaler()
train_set = normalisation.fit_transform(train_set)
train_set

array([[0.2972973 , 0.        , 0.83333333, ..., 0.70391061, 0.12903226,
        0.14716981],
       [0.02910603, 0.        , 0.83333333, ..., 0.41620112, 0.075382  ,
        0.08710692],
       [0.04158004, 0.        , 0.        , ..., 0.44040968, 0.08098472,
        0.09213836],
       ...,
       [0.02494802, 0.25      , 0.16666667, ..., 0.40502793, 0.07453311,
        0.08427673],
       [0.04365904, 0.        , 0.83333333, ..., 0.44785847, 0.08132428,
        0.09213836],
       [0.14760915, 0.        , 0.66666667, ..., 0.58100559, 0.10509338,
        0.11949686]], shape=(43152, 10))

In [22]:
pd.DataFrame(train_set,columns=df.columns)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.297297,0.00,0.833333,0.285714,0.519444,0.333333,0.830990,0.703911,0.129032,0.147170
1,0.029106,0.00,0.833333,0.428571,0.533333,0.388889,0.023681,0.416201,0.075382,0.087107
2,0.041580,0.00,0.000000,0.571429,0.519444,0.361111,0.044929,0.440410,0.080985,0.092138
3,0.079002,0.25,0.500000,0.142857,0.530556,0.333333,0.054336,0.500931,0.090832,0.104717
4,0.089397,0.75,1.000000,0.142857,0.550000,0.388889,0.084234,0.502793,0.092699,0.107233
...,...,...,...,...,...,...,...,...,...,...
43147,0.066528,0.25,0.666667,0.428571,0.491667,0.444444,0.075314,0.482309,0.087267,0.098428
43148,0.103950,0.50,1.000000,0.142857,0.572222,0.472222,0.118134,0.520484,0.093548,0.110377
43149,0.024948,0.25,0.166667,0.285714,0.508333,0.416667,0.009624,0.405028,0.074533,0.084277
43150,0.043659,0.00,0.833333,1.000000,0.500000,0.388889,0.059689,0.447858,0.081324,0.092138


# Transformation du test set

In [24]:
encoding_results = encoder.transform(test_set[['cut','color','clarity']])
encoding_results

array([[0., 3., 0.],
       [0., 3., 1.],
       [1., 1., 1.],
       ...,
       [3., 4., 0.],
       [0., 5., 5.],
       [2., 0., 4.]], shape=(10788, 3))

In [27]:
test_set.loc[:,['cut','color','clarity']]=encoding_results

  test_set.loc[:,['cut','color','clarity']]=encoding_results
  test_set.loc[:,['cut','color','clarity']]=encoding_results
  test_set.loc[:,['cut','color','clarity']]=encoding_results


In [28]:
pd.DataFrame(test_set,columns=df.columns)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
10176,1.10,0.0,3.0,0.0,62.0,55.0,4733,6.61,6.65,4.11
16083,1.29,0.0,3.0,1.0,62.6,56.0,6424,6.96,6.93,4.35
13420,1.20,1.0,1.0,1.0,61.1,58.0,5510,6.88,6.80,4.18
20407,1.50,0.0,4.0,1.0,60.9,56.0,8770,7.43,7.36,4.50
8909,0.90,3.0,4.0,3.0,61.7,57.0,4493,6.17,6.21,3.82
...,...,...,...,...,...,...,...,...,...,...
42208,0.52,2.0,3.0,3.0,63.6,57.0,1289,5.05,5.10,3.23
3638,0.91,3.0,5.0,0.0,60.4,61.0,3435,6.21,6.28,3.77
5508,1.08,3.0,4.0,0.0,63.4,55.0,3847,6.53,6.50,4.13
19535,1.02,0.0,5.0,5.0,61.5,57.0,8168,6.44,6.47,3.97


In [30]:
test_set = normalisation.transform(test_set)
test_set



array([[-0.00267979,  0.        ,  0.08333333, ...,  0.05730505,
         0.00191686,  0.00406432],
       [ 0.00553248,  0.        ,  0.08333333, ...,  0.06033936,
         0.00199757,  0.00430165],
       [ 0.00164245,  0.0625    ,  0.02777778, ...,  0.0596458 ,
         0.0019601 ,  0.00413354],
       ...,
       [-0.00354424,  0.1875    ,  0.11111111, ...,  0.05661149,
         0.00187363,  0.00408409],
       [-0.00613759,  0.        ,  0.13888889, ...,  0.05583124,
         0.00186498,  0.00392587],
       [-0.02861329,  0.125     ,  0.        , ...,  0.04308716,
         0.00144125,  0.0031941 ]], shape=(10788, 10))

In [31]:
pd.DataFrame(test_set,columns=df.columns)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,-0.002680,0.0000,0.083333,0.000000,-1.179784,-1.185185,-0.017667,0.057305,0.001917,0.004064
1,0.005532,0.0000,0.083333,0.020408,-1.179321,-1.184414,-0.017662,0.060339,0.001998,0.004302
2,0.001642,0.0625,0.027778,0.020408,-1.180478,-1.182870,-0.017664,0.059646,0.001960,0.004134
3,0.014609,0.0000,0.111111,0.020408,-1.180633,-1.184414,-0.017655,0.064414,0.002122,0.004450
4,-0.011324,0.1875,0.111111,0.061224,-1.180015,-1.183642,-0.017667,0.053490,0.001790,0.003778
...,...,...,...,...,...,...,...,...,...,...
10783,-0.027749,0.1250,0.083333,0.061224,-1.178549,-1.183642,-0.017677,0.043781,0.001470,0.003194
10784,-0.010892,0.1875,0.138889,0.000000,-1.181019,-1.180556,-0.017670,0.053837,0.001810,0.003728
10785,-0.003544,0.1875,0.111111,0.000000,-1.178704,-1.185185,-0.017669,0.056611,0.001874,0.004084
10786,-0.006138,0.0000,0.138889,0.102041,-1.180170,-1.183642,-0.017657,0.055831,0.001865,0.003926
