In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, log_loss
import operator
import json
from IPython import display
import os
import warnings

np.random.seed(0)
warnings.filterwarnings("ignore")
THRESHOLD = 4

Task: To predict whether the user likes the mobile phone or not. <br>
Assumption: If the average rating of mobile >= threshold, then the user likes it, otherwise not.

<b>Missing values:</b><br>
'Also Known As'(459),'Applications'(421),'Audio Features'(437),'Bezel-less display'(266),'Browser'(449),'Build Material'(338),'Co-Processor'(451),'Display Colour'(457),'Mobile High-Definition Link(MHL)'(472),'Music'(447)
'Email','Fingerprint Sensor Position'(174),'Games'(446),'HDMI'(454),'Heart Rate Monitor'(467),'IRIS Scanner'(467),
'Optical Image Stabilisation'(219),'Other Facilities'(444),'Phone Book'(444),'Physical Aperture'(87),'Quick Charging'(122),'Ring Tone'(444),'Ruggedness'(430),SAR Value(315),'SIM 3'(472),'SMS'(470)', 'Screen Protection'(229),'Screen to Body Ratio (claimed by the brand)'(428),'Sensor'(242),'Software Based Aperture'(473),
'Special Features'(459),'Standby time'(334),'Stylus'(473),'TalkTime'(259), 'USB Type-C'(374),'Video Player'(456),
'Video Recording Features'(458),'Waterproof'(398),'Wireless Charging','USB OTG Support'(159), 'Video ,'Recording'(113),'Java'(471),'Browser'(448)

<b>Very low variance:</b><br>
'Architecture'(most entries are 64-bit),'Audio Jack','GPS','Loudspeaker','Network','Network Support','Other Sensors'(28),'SIM Size', 'VoLTE'


<b>Multivalued:</b><br>
'Colours','Custom UI','Model'(1),'Other Sensors','Launch Date'

<b>Not important:</b><br>
'Bluetooth', 'Settings'(75),'Wi-Fi','Wi-Fi Features'

<b>Doubtful:</b><br>
'Aspect Ratio','Autofocus','Brand','Camera Features','Fingerprint Sensor'(very few entries are missing),
'Fingerprint Sensor Position', 'Graphics'(multivalued),'Image resolution'(multivalued),'SIM Size','Sim Slot(s)', 'User Available Storage', 'SIM 1', 'SIM 2','Shooting Modes', 'Touch Screen'(24), 'USB Connectivity'
    
<b>To check:</b><br>
'Display Type','Expandable Memory','FM Radio'

<b>High Correlation with other features</b><br>
'SIM Slot(s)' high correlation with SIM1
'Weight' has high high correlation with capacity , screen-to-body ratio
'Height' - screen size is also there
    
<b>Given a mobile, we can't directly get these features</b><br>
'Rating Count', 'Review Count'

<b>Keeping:</b><br>
'Capacity','Flash'(17),'Height'(22),'Internal Memory'(20, require cleaning),'Operating System'(25, require cleaning), 'Pixel Density'(1, clean it),'Processor'(22, clean it), 'RAM'(17, clean), 'Rating','Resolution'(cleaning), 'Screen Resolution','Screen Size', 'Thickness'(22), 'Type','User Replaceable','Weight'(cleaning),'Sim Size'(), 'Other Sensors'(28), 'Screen to Body Ratio (calculated)','Width',


In [2]:
# read data from file
train = pd.read_csv("../input/padhai-mp-neuron-like-unlike-classification/train.csv") 
test = pd.read_csv("../input/padhai-mp-neuron-like-unlike-classification/test.csv")

# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 355
Number of features in train: 99
Number of data points in test: 119
Number of features in test: 98


In [3]:
def data_clean(data):
    
    # Let's first remove all missing value features
    columns_to_remove = ['Also Known As','Applications','Audio Features','Bezel-less display'
                         'Browser','Build Material','Co-Processor','Browser'
                         'Display Colour','Mobile High-Definition Link(MHL)',
                         'Music', 'Email','Fingerprint Sensor Position',
                         'Games','HDMI','Heart Rate Monitor','IRIS Scanner', 
                         'Optical Image Stabilisation','Other Facilities',
                         'Phone Book','Physical Aperture','Quick Charging',
                         'Ring Tone','Ruggedness','SAR Value','SIM 3','SMS',
                         'Screen Protection','Screen to Body Ratio (claimed by the brand)',
                         'Sensor','Software Based Aperture', 'Special Features',
                         'Standby time','Stylus','TalkTime', 'USB Type-C',
                         'Video Player', 'Video Recording Features','Waterproof',
                         'Wireless Charging','USB OTG Support', 'Video Recording','Java']

    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    #Features having very low variance 
    columns_to_remove = ['Architecture','Audio Jack','GPS','Loudspeaker','Network','Network Support','VoLTE']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    # Multivalued:
    columns_to_remove = ['Architecture','Launch Date','Audio Jack','GPS','Loudspeaker','Network','Network Support','VoLTE', 'Custom UI']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    # Not much important
    columns_to_remove = ['Bluetooth', 'Settings','Wi-Fi','Wi-Fi Features']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]
    
    return data

# Removing features

In [4]:
train = data_clean(train)
test = data_clean(test)

removing all those data points in which more than 15 features are missing 

In [5]:
train = train[(train.isnull().sum(axis=1) <= 15)]
# You shouldn't remove data points from test set
#test = test[(test.isnull().sum(axis=1) <= 15)]

In [6]:
# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 341
Number of features in train: 47
Number of data points in test: 119
Number of features in test: 46


# Filling Missing values

In [7]:
def for_integer(test):
    try:
        test = test.strip()
        return int(test.split(' ')[0])
    except IOError:
           pass
    except ValueError:
        pass
    except:
        pass

def for_string(test):
    try:
        test = test.strip()
        return (test.split(' ')[0])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass

def for_float(test):
    try:
        test = test.strip()
        return float(test.split(' ')[0])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass
def find_freq(test):
    try:
        test = test.strip()
        test = test.split(' ')
        if test[2][0] == '(':
            return float(test[2][1:])
        return float(test[2])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass

    
def for_Internal_Memory(test):
    try:
        test = test.strip()
        test = test.split(' ')
        if test[1] == 'GB':
            return int(test[0])
        if test[1] == 'MB':
#             print("here")
            return (int(test[0]) * 0.001)
    except IOError:
           pass
    except ValueError:
        pass
    except:
        pass
    
def find_freq(test):
    try:
        test = test.strip()
        test = test.split(' ')
        if test[2][0] == '(':
            return float(test[2][1:])
        return float(test[2])
    except IOError:
        pass
    except ValueError:
        pass
    except:
        pass


In [8]:
def data_clean_2(x):
    data = x.copy()
    
    data['Capacity'] = data['Capacity'].apply(for_integer)

    data['Height'] = data['Height'].apply(for_float)
    data['Height'] = data['Height'].fillna(data['Height'].mean())

    data['Internal Memory'] = data['Internal Memory'].apply(for_Internal_Memory)

    data['Pixel Density'] = data['Pixel Density'].apply(for_integer)

    data['Internal Memory'] = data['Internal Memory'].fillna(data['Internal Memory'].median())
    data['Internal Memory'] = data['Internal Memory'].astype(int)

    data['RAM'] = data['RAM'].apply(for_integer)
    data['RAM'] = data['RAM'].fillna(data['RAM'].median())
    data['RAM'] = data['RAM'].astype(int)

    data['Resolution'] = data['Resolution'].apply(for_integer)
    data['Resolution'] = data['Resolution'].fillna(data['Resolution'].median())
    data['Resolution'] = data['Resolution'].astype(int)

    data['Screen Size'] = data['Screen Size'].apply(for_float)

    data['Thickness'] = data['Thickness'].apply(for_float)
    data['Thickness'] = data['Thickness'].fillna(data['Thickness'].mean())
    data['Thickness'] = data['Thickness'].round(2)

    data['Type'] = data['Type'].fillna('Li-Polymer')

    data['Screen to Body Ratio (calculated)'] = data['Screen to Body Ratio (calculated)'].apply(for_float)
    data['Screen to Body Ratio (calculated)'] = data['Screen to Body Ratio (calculated)'].fillna(data['Screen to Body Ratio (calculated)'].mean())
    data['Screen to Body Ratio (calculated)'] = data['Screen to Body Ratio (calculated)'].round(2)

    data['Width'] = data['Width'].apply(for_float)
    data['Width'] = data['Width'].fillna(data['Width'].mean())
    data['Width'] = data['Width'].round(2)

    data['Flash'][data['Flash'].isna() == True] = "Other"

    data['User Replaceable'][data['User Replaceable'].isna() == True] = "Other"

    data['Num_cores'] = data['Processor'].apply(for_string)
    data['Num_cores'][data['Num_cores'].isna() == True] = "Other"


    data['Processor_frequency'] = data['Processor'].apply(find_freq)
    #because there is one entry with 208MHz values, to convert it to GHz
    data['Processor_frequency'][data['Processor_frequency'] > 200] = 0.208
    data['Processor_frequency'] = data['Processor_frequency'].fillna(data['Processor_frequency'].mean())
    data['Processor_frequency'] = data['Processor_frequency'].round(2)

    data['Camera Features'][data['Camera Features'].isna() == True] = "Other"

    #simplifyig Operating System to os_name for simplicity
    data['os_name'] = data['Operating System'].apply(for_string)
    data['os_name'][data['os_name'].isna() == True] = "Other"

    data['Sim1'] = data['SIM 1'].apply(for_string)

    data['SIM Size'][data['SIM Size'].isna() == True] = "Other"

    data['Image Resolution'][data['Image Resolution'].isna() == True] = "Other"

    data['Fingerprint Sensor'][data['Fingerprint Sensor'].isna() == True] = "Other"

    data['Expandable Memory'][data['Expandable Memory'].isna() == True] = "No"

    data['Weight'] = data['Weight'].apply(for_integer)
    data['Weight'] = data['Weight'].fillna(data['Weight'].mean())
    data['Weight'] = data['Weight'].astype(int)

    data['SIM 2'] = data['SIM 2'].apply(for_string)
    data['SIM 2'][data['SIM 2'].isna() == True] = "Other"
    
    return data

In [9]:
train = data_clean_2(train)
test = data_clean_2(test)

# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 341
Number of features in train: 51
Number of data points in test: 119
Number of features in test: 50


Not very important feature

In [10]:
def data_clean_3(x):
    
    data = x.copy()

    columns_to_remove = ['User Available Storage','SIM Size','Chipset','Processor','Autofocus','Aspect Ratio','Touch Screen',
                        'Bezel-less display','Operating System','SIM 1','USB Connectivity','Other Sensors','Graphics','FM Radio',
                        'NFC','Shooting Modes','Browser','Display Colour' ]

    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]


    columns_to_remove = [ 'Screen Resolution','User Replaceable','Camera Features',
                        'Thickness', 'Display Type']

    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]


    columns_to_remove = ['Fingerprint Sensor', 'Flash', 'Rating Count', 'Review Count','Image Resolution','Type','Expandable Memory',\
                        'Colours','Width','Model']
    columns_to_retain = list(set(data.columns)-set(columns_to_remove))
    data = data[columns_to_retain]

    return data

In [11]:
train = data_clean_3(train)
test = data_clean_3(test)

# check the number of features and data points in train
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

Number of data points in train: 341
Number of features in train: 18
Number of data points in test: 119
Number of features in test: 17


In [12]:
# one hot encoding

train_ids = train['PhoneId']
test_ids = test['PhoneId']

cols = list(test.columns)
cols.remove('PhoneId')
cols.insert(0, 'PhoneId')

combined = pd.concat([train.drop('Rating', axis=1)[cols], test[cols]])
print(combined.shape)
print(combined.columns)

combined = pd.get_dummies(combined)
print(combined.shape)
print(combined.columns)

train_new = combined[combined['PhoneId'].isin(train_ids)]
test_new = combined[combined['PhoneId'].isin(test_ids)]

(460, 17)
Index(['PhoneId', 'Screen Size', 'Sim1', 'Internal Memory',
       'Processor_frequency', 'Brand', 'SIM Slot(s)',
       'Screen to Body Ratio (calculated)', 'os_name', 'Resolution', 'Height',
       'RAM', 'Capacity', 'Weight', 'Pixel Density', 'SIM 2', 'Num_cores'],
      dtype='object')
(460, 87)
Index(['PhoneId', 'Screen Size', 'Internal Memory', 'Processor_frequency',
       'Screen to Body Ratio (calculated)', 'Resolution', 'Height', 'RAM',
       'Capacity', 'Weight', 'Pixel Density', 'Sim1_2G', 'Sim1_3G', 'Sim1_4G',
       'Brand_10.or', 'Brand_Apple', 'Brand_Asus', 'Brand_Billion',
       'Brand_Blackberry', 'Brand_Comio', 'Brand_Coolpad', 'Brand_Do',
       'Brand_Gionee', 'Brand_Google', 'Brand_HTC', 'Brand_Honor',
       'Brand_Huawei', 'Brand_InFocus', 'Brand_Infinix', 'Brand_Intex',
       'Brand_Itel', 'Brand_Jivi', 'Brand_Karbonn', 'Brand_LG', 'Brand_Lava',
       'Brand_LeEco', 'Brand_Lenovo', 'Brand_Lephone', 'Brand_Lyf',
       'Brand_Meizu', 'Brand_Microma

In [13]:
train_new = train_new.merge(train[['PhoneId', 'Rating']], on='PhoneId')

In [14]:
# check the number of features and data points in train
print("Number of data points in train: %d" % train_new.shape[0])
print("Number of features in train: %d" % train_new.shape[1])

# check the number of features and data points in test
print("Number of data points in test: %d" % test_new.shape[0])
print("Number of features in test: %d" % test_new.shape[1])

Number of data points in train: 341
Number of features in train: 88
Number of data points in test: 119
Number of features in test: 87


In [15]:
train_new.head()

Unnamed: 0,PhoneId,Screen Size,Internal Memory,Processor_frequency,Screen to Body Ratio (calculated),Resolution,Height,RAM,Capacity,Weight,...,SIM 2_Other,Num_cores_312,Num_cores_Deca,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa,Rating
0,0,6.26,64,1.8,80.68,20,157.9,4,4000,182,...,0,0,0,0,0,1,0,0,0,4.5
1,1,6.2,32,1.8,80.85,8,156.2,3,4230,168,...,0,0,0,0,0,1,0,0,0,4.5
2,2,6.3,32,2.1,83.68,25,157.0,3,3500,168,...,0,0,0,0,0,1,0,0,0,4.4
3,4,6.0,64,2.2,74.78,24,159.8,4,3300,169,...,0,0,0,0,0,1,0,0,0,4.3
4,5,6.5,64,2.2,84.23,16,160.4,4,3750,175,...,0,0,0,0,0,1,0,0,0,4.4


In [16]:
test_new.head()

Unnamed: 0,PhoneId,Screen Size,Internal Memory,Processor_frequency,Screen to Body Ratio (calculated),Resolution,Height,RAM,Capacity,Weight,...,SIM 2_4G,SIM 2_Other,Num_cores_312,Num_cores_Deca,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa
0,3,6.2,16,1.8,80.85,5,156.2,2,4230,168,...,1,0,0,0,0,0,1,0,0,0
1,11,6.2,64,1.8,81.6,12,156.0,4,5000,205,...,1,0,0,0,0,0,1,0,0,0
2,13,6.3,64,2.0,83.84,25,156.7,6,3500,169,...,1,0,0,0,0,0,1,0,0,0
3,16,6.3,64,2.0,83.84,16,156.7,4,3500,169,...,1,0,0,0,0,0,1,0,0,0
4,19,5.99,64,1.8,77.43,20,158.6,4,4000,181,...,1,0,0,0,0,0,1,0,0,0


## Dummy Solution

In [17]:
#X1 = train_new['Rating'].map(lambda x: 1 if x >= 4 else 0)
#df = pd.DataFrame(X_rating,columns = ['Rating'])
X = train_new.drop('Rating',axis=1)
X1 = list(train_new['Rating'].map(lambda x: 1 if x >= 4 else 0))
X['Rating'] = pd.Series(X1)
X2 = X.drop('PhoneId',axis=1)
X3 = X.drop('RAM',axis = 1)
#X11 = pd.DataFrame(X1, columns = 
#train_new2 = pd.concat([df1,df2],axis = 1)
#train_new2.head()
X['RAM'].replace(512,0,inplace = True)
X.groupby('Capacity').max()
#plt.plot(YM['Rating'])
#ym.to_csv("visual.csv", index=True)
X.shape

(341, 88)

In [18]:
X.groupby('Screen to Body Ratio (calculated)').mean()

Unnamed: 0_level_0,PhoneId,Screen Size,Internal Memory,Processor_frequency,Resolution,Height,RAM,Capacity,Weight,Pixel Density,...,SIM 2_Other,Num_cores_312,Num_cores_Deca,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa,Rating
Screen to Body Ratio (calculated),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23.60,97.0,2.40,4.0,1.10,8.0,134.0,0.0,2000.0,58.0,167.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
28.26,75.0,2.45,4.0,1.10,2.0,133.4,0.0,1500.0,117.0,163.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
29.59,247.0,2.40,32.0,1.79,2.0,117.5,32.0,1200.0,91.0,167.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
30.11,163.5,2.40,32.0,1.79,8.0,118.0,9.5,1020.0,121.0,167.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
30.26,168.0,2.40,0.0,1.79,2.0,115.6,3.0,1200.0,161.0,167.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85.05,76.0,6.40,128.0,2.00,25.0,157.5,8.0,3500.0,182.0,403.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
85.43,104.5,6.41,128.0,2.80,16.0,157.5,7.0,3700.0,185.0,402.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
86.29,73.0,6.40,256.0,2.65,25.0,156.7,8.0,3730.0,186.0,403.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
86.47,42.0,6.59,128.0,2.80,8.0,162.0,8.0,4000.0,199.0,388.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [19]:
Y = X.copy()
Z = Y['PhoneId']
K = Y.drop('PhoneId',axis = 1)
X_train = K.drop('Rating',axis=1)
Y_train = K['Rating']

#X_train['RAM'].replace(512,0,inplace = True)
#X_train['RAM'].replace(1,3,inplace = True)
#X_train['RAM'].replace(2,7,inplace = True)
#X_train['RAM'].replace(3,6,inplace = True)
#X_train['RAM'].replace(4,8,inplace = True)
#X_train['RAM'].replace(6,9,inplace = True)
#X_train['RAM'].replace(8,8,inplace = True)
#X_train['RAM'].replace(16,10,inplace = True)
#X_train['RAM'].replace(32,10,inplace = True)
#X_train['RAM'].replace(64,10,inplace = True)

X_train['RAM_normalized'] = (X_train['RAM'] - X_train['RAM'].min()) / (X_train['RAM'].max() - X_train['RAM'].min())
X_train['RAM1'] = X_train['RAM'].apply(lambda x:1 if x>=0 else 0)
X_train['RAM2'] = X_train['RAM'].apply(lambda x:1 if x>=0.1 else 0)
X_train['RAM3'] = X_train['RAM'].apply(lambda x:1 if x>=0.2 else 0)
X_train['RAM4'] = X_train['RAM'].apply(lambda x:1 if x>=0.3 else 0)
X_train['RAM5'] = X_train['RAM'].apply(lambda x:1 if x>=0.4 else 0)
X_train['RAM6'] = X_train['RAM'].apply(lambda x:1 if x>=0.5 else 0)
X_train['RAM7'] = X_train['RAM'].apply(lambda x:1 if x>=0.6 else 0)
X_train['RAM8'] = X_train['RAM'].apply(lambda x:1 if x>=0.7 else 0)
X_train['RAM9'] = X_train['RAM'].apply(lambda x:1 if x>=0.8 else 0)
X_train['RAM10'] = X_train['RAM'].apply(lambda x:1 if x>=0.9 else 0)

#X_train['Internal Memory'].replace(0,0,inplace = True)
#X_train['Internal Memory'].replace(4,0,inplace = True)
#X_train['Internal Memory'].replace(8,3,inplace = True)
#X_train['Internal Memory'].replace(16,7,inplace = True)
#X_train['Internal Memory'].replace(32,7,inplace = True)
#X_train['Internal Memory'].replace(64,8,inplace = True)
#X_train['Internal Memory'].replace(128,9,inplace = True)
#X_train['Internal Memory'].replace(256,10,inplace = True)
#X_train['Internal Memory'].replace(512,10,inplace = True)

X_train['Internal_Memory_normalized'] = (X_train['Internal Memory'] - X_train['Internal Memory'].min()) / (X_train['Internal Memory'].max() - X_train['Internal Memory'].min())
X_train['Internal Memory1'] = X_train['Internal Memory'].apply(lambda x:1 if x>=0 else 0)
X_train['Internal Memory2'] = X_train['Internal Memory'].apply(lambda x:1 if x>=0.1 else 0)
X_train['Internal Memory3'] = X_train['Internal Memory'].apply(lambda x:1 if x>=0.2 else 0)
X_train['Internal Memory4'] = X_train['Internal Memory'].apply(lambda x:1 if x>=0.3 else 0)
X_train['Internal Memory5'] = X_train['Internal Memory'].apply(lambda x:1 if x>=0.4 else 0)
X_train['Internal Memory6'] = X_train['Internal Memory'].apply(lambda x:1 if x>=0.5 else 0)
X_train['Internal Memory7'] = X_train['Internal Memory'].apply(lambda x:1 if x>=0.6 else 0)
X_train['Internal Memory8'] = X_train['Internal Memory'].apply(lambda x:1 if x>=0.7 else 0)
X_train['Internal Memory9'] = X_train['Internal Memory'].apply(lambda x:1 if x>=0.8 else 0)
X_train['Internal Memory10'] = X_train['Internal Memory'].apply(lambda x:1 if x>=0.9 else 0)

#X_train['Height_normalized'] = (X_train['Height'] - X_train['Height'].min()) / (X_train['Height'].max() - X_train['Height'].min())
#X_train['Height_normalized1'] = X_train['Height_normalized'].apply(lambda x:1 if x>=0 else 0)
#X_train['Height_normalized2'] = X_train['Height_normalized'].apply(lambda x:1 if x>=0.1 else 0)
#X_train['Height_normalized3'] = X_train['Height_normalized'].apply(lambda x:1 if x>=0.2 else 0)
#X_train['Height_normalized4'] = X_train['Height_normalized'].apply(lambda x:1 if x>=0.3 else 0)
#X_train['Height_normalized5'] = X_train['Height_normalized'].apply(lambda x:1 if x>=0.4 else 0)
#X_train['Height_normalized6'] = X_train['Height_normalized'].apply(lambda x:1 if x>=0.5 else 0)
#X_train['Height_normalized7'] = X_train['Height_normalized'].apply(lambda x:1 if x>=0.6 else 0)
#X_train['Height_normalized8'] = X_train['Height_normalized'].apply(lambda x:1 if x>=0.7 else 0)
#X_train['Height_normalized9'] = X_train['Height_normalized'].apply(lambda x:1 if x>=0.8 else 0)
#X_train['Height_normalized10'] = X_train['Height_normalized'].apply(lambda x:1 if x>=0.9 else 0)

X_train['Screen to Body Ratio (calculated)_normalized'] = (X_train['Screen to Body Ratio (calculated)'] - X_train['Screen to Body Ratio (calculated)'].min()) / (X_train['Screen to Body Ratio (calculated)'].max() - X_train['Screen to Body Ratio (calculated)'].min())
X_train['Screen to Body Ratio (calculated)_normalized1'] = X_train['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0 else 0)
X_train['Screen to Body Ratio (calculated)_normalized2'] = X_train['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0.1 else 0)
X_train['Screen to Body Ratio (calculated)_normalized3'] = X_train['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0.2 else 0)
X_train['Screen to Body Ratio (calculated)_normalized4'] = X_train['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0.3 else 0)
X_train['Screen to Body Ratio (calculated)_normalized5'] = X_train['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0.4 else 0)
X_train['Screen to Body Ratio (calculated)_normalized6'] = X_train['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0.5 else 0)
X_train['Screen to Body Ratio (calculated)_normalized7'] = X_train['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0.6 else 0)
X_train['Screen to Body Ratio (calculated)_normalized8'] = X_train['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0.7 else 0)
X_train['Screen to Body Ratio (calculated)_normalized9'] = X_train['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0.8 else 0)
X_train['Screen to Body Ratio (calculated)_normalized10'] = X_train['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0.9 else 0)


#X_train['Screen_Size_normalized'] = (X_train['Screen Size'] - X_train['Screen Size'].min()) / (X_train['Screen Size'].max() - X_train['Screen Size'].min())
#bins = [0,0.99,1.99,2.99,3.99,4.99,5.99,6.99,7.99,8.99,9.99]
#labels = [0,1,2,3,4,5,6,7,8,9]
#X['SBRC_binned'] = pd.cut(X['Screen_Size_normalized'], bins=bins, labels=labels)
#X.head()

#X_train['Screen to Body Ratio (calculated)_normalized'] = (X_train['Screen to Body Ratio (calculated)'] - X_train['Screen to Body Ratio (calculated)'].min()) / (X_train['Screen to Body Ratio (calculated)'].max() - X_train['Screen to Body Ratio (calculated)'].min())

#X_train['log_Weight']=(X_train['Weight']+1).transform(np.log)
#X_train['log_Pixel Density']=(X_train['Pixel Density']+1).transform(np.log)
#X_train['log_Screen Size']=(X_train['Screen Size']+1).transform(np.log)
#X_train['log_Capacity']=(X_train['Capacity']+1).transform(np.log)
#X_train['log_Resolution']=(X_train['Resolution']+1).transform(np.log)
#X_train['log_Processor_frequency']=(X_train['Processor_frequency']+1).transform(np.log)


#XX_train = X_train.drop('Weight',axis=1)
#XX_train = XX_train.drop('Screen Size',axis=1)
XX_train = X_train.drop('RAM',axis=1)
#XX_train = XX_train.drop('Height',axis=1)
XX_train = XX_train.drop('Internal Memory',axis=1)
#XX_train = XX_train.drop('Height_normalized',axis=1)
#XX_train = XX_train.drop('Pixel Density',axis=1)
#XX_train = XX_train.drop('Capacity',axis=1)
XX_train = XX_train.drop('Screen to Body Ratio (calculated)',axis=1)
XX_train = XX_train.drop('Screen to Body Ratio (calculated)_normalized',axis=1)
XX_train = XX_train.drop('RAM_normalized',axis=1)
XX_train = XX_train.drop('Internal_Memory_normalized',axis=1)
#XX_train = XX_train.drop('Processor_frequency',axis=1)
#XX_train = XX_train.drop('Resolution',axis=1)

X_train = XX_train.copy()
#X_train.head()
X_binarized_train = X_train.apply(pd.cut,bins = 2,labels =[0,1])
X_binarized_train = X_binarized_train.values
Y_train = Y_train.values
#X_train = X_train.values
#Y_train = Y_train.values

X_binarized_train.shape
#X['Screen_Size_normalized'].max()



(341, 113)

In [20]:
L=[]
L.append(X_binarized_train.dtypes)
print(L)

AttributeError: 'numpy.ndarray' object has no attribute 'dtypes'

In [21]:
D=[]
D.append(H.dtypes)
print(D)

NameError: name 'H' is not defined

In [22]:
'''
X_train['RAM'].replace(512,0,inplace = True)
X_train['RAM'].replace(1,3,inplace = True)
X_train['RAM'].replace(2,7,inplace = True)
X_train['RAM'].replace(3,6,inplace = True)
X_train['RAM'].replace(4,8,inplace = True)
X_train['RAM'].replace(6,9,inplace = True)
X_train['RAM'].replace(8,8,inplace = True)
X_train['RAM'].replace(16,10,inplace = True)
X_train['RAM'].replace(32,10,inplace = True)
X_train['RAM'].replace(64,10,inplace = True)

X_train['Internal Memory'].replace(0,0,inplace = True)
X_train['Internal Memory'].replace(4,0,inplace = True)
X_train['Internal Memory'].replace(8,3,inplace = True)
X_train['Internal Memory'].replace(16,7,inplace = True)
X_train['Internal Memory'].replace(32,7,inplace = True)
X_train['Internal Memory'].replace(64,8,inplace = True)
X_train['Internal Memory'].replace(128,9,inplace = True)
X_train['Internal Memory'].replace(256,10,inplace = True)
X_train['Internal Memory'].replace(512,10,inplace = True)
'''


'''
X_train['RAM1'] = X_train['RAM'].apply(lambda x:1 if x>=1 else 0)
X_train['RAM2'] = X_train['RAM'].apply(lambda x:1 if x>=2 else 0)
X_train['RAM3'] = X_train['RAM'].apply(lambda x:1 if x>=3 else 0)
X_train['RAM4'] = X_train['RAM'].apply(lambda x:1 if x>=4 else 0)
X_train['RAM5'] = X_train['RAM'].apply(lambda x:1 if x>=5 else 0)
X_train['RAM6'] = X_train['RAM'].apply(lambda x:1 if x>=6 else 0)
X_train['RAM7'] = X_train['RAM'].apply(lambda x:1 if x>=7 else 0)
X_train['RAM8'] = X_train['RAM'].apply(lambda x:1 if x>=8 else 0)
X_train['RAM9'] = X_train['RAM'].apply(lambda x:1 if x>=9 else 0)
X_train['RAM10'] = X_train['RAM'].apply(lambda x:1 if x>=10 else 0)

X_train['Internal Memory1'] = X_train['Internal Memory'].apply(lambda x:1 if x>=1 else 0)
X_train['Internal Memory2'] = X_train['Internal Memory'].apply(lambda x:1 if x>=2 else 0)
X_train['Internal Memory3'] = X_train['Internal Memory'].apply(lambda x:1 if x>=3 else 0)
X_train['Internal Memory4'] = X_train['Internal Memory'].apply(lambda x:1 if x>=4 else 0)
X_train['Internal Memory5'] = X_train['Internal Memory'].apply(lambda x:1 if x>=5 else 0)
X_train['Internal Memory6'] = X_train['Internal Memory'].apply(lambda x:1 if x>=6 else 0)
X_train['Internal Memory7'] = X_train['Internal Memory'].apply(lambda x:1 if x>=7 else 0)
X_train['Internal Memory8'] = X_train['Internal Memory'].apply(lambda x:1 if x>=8 else 0)
X_train['Internal Memory9'] = X_train['Internal Memory'].apply(lambda x:1 if x>=9 else 0)
X_train['Internal Memory10'] = X_train['Internal Memory'].apply(lambda x:1 if x>=10 else 0)
'''

'''
bins = [1000,1999,2999,3999,4999,5999,6999,7999,8999,9999,10999,11999,13000]
labels = [1,2,3,4,5,6,7,8,9,10,11,12]
X['Capacity_binned'] = pd.cut(X['Capacity'], bins=bins, labels=labels)

bins = [100,199,299,399,499,599]
labels = [1,2,3,4,5]
X['Pixel_Density_binned'] = pd.cut(X['Pixel Density'], bins=bins, labels=labels)

bins = [0,9,19,29]
labels = [1,2,3]
X['Resolution_binned'] = pd.cut(X['Resolution'], bins=bins, labels=labels)

bins = [0,0.99,1.99,2.99,3.99,4.99,5.99,6.99]
labels = [0,1,2,3,4,5,6]
X['Screen_Size_binned'] = pd.cut(X['Screen Size'], bins=bins, labels=labels)

bins = [0,0.99,1.99,2.99]
labels = [0,1,2]
X['Processor_frequency_binned'] = pd.cut(X['Processor_frequency'], bins=bins, labels=labels)
'''

'''
X['SBRC_binned2'] = X['SBRC_binned'].apply(lambda x:1 if x>=2 else 0)
X['SBRC_binned3'] = X['SBRC_binned'].apply(lambda x:1 if x>=3 else 0)
X['SBRC_binned4'] = X['SBRC_binned'].apply(lambda x:1 if x>=4 else 0)
X['SBRC_binned5'] = X['SBRC_binned'].apply(lambda x:1 if x>=5 else 0)
X['SBRC_binned6'] = X['SBRC_binned'].apply(lambda x:1 if x>=6 else 0)
X['SBRC_binned7'] = X['SBRC_binned'].apply(lambda x:1 if x>=7 else 0)
X['SBRC_binned8'] = X['SBRC_binned'].apply(lambda x:1 if x>=8 else 0)


X['Capacity_binned1'] = X['Capacity_binned'].apply(lambda x:1 if x>=1 else 0)
X['Capacity_binned2'] = X['Capacity_binned'].apply(lambda x:1 if x>=2 else 0)
X['Capacity_binned3'] = X['Capacity_binned'].apply(lambda x:1 if x>=3 else 0)
X['Capacity_binned4'] = X['Capacity_binned'].apply(lambda x:1 if x>=4 else 0)
X['Capacity_binned5'] = X['Capacity_binned'].apply(lambda x:1 if x>=5 else 0)
X['Capacity_binned6'] = X['Capacity_binned'].apply(lambda x:1 if x>=6 else 0)
X['Capacity_binned7'] = X['Capacity_binned'].apply(lambda x:1 if x>=7 else 0)
X['Capacity_binned8'] = X['Capacity_binned'].apply(lambda x:1 if x>=8 else 0)
X['Capacity_binned9'] = X['Capacity_binned'].apply(lambda x:1 if x>=9 else 0)
X['Capacity_binned10'] = X['Capacity_binned'].apply(lambda x:1 if x>=10 else 0)
X['Capacity_binned11'] = X['Capacity_binned'].apply(lambda x:1 if x>=11 else 0)
X['Capacity_binned12'] = X['Capacity_binned'].apply(lambda x:1 if x>=12 else 0)

X['Pixel_Density_binned1'] = X['Pixel_Density_binned'].apply(lambda x:1 if x>=1 else 0)
X['Pixel_Density_binned2'] = X['Pixel_Density_binned'].apply(lambda x:1 if x>=2 else 0)
X['Pixel_Density_binned3'] = X['Pixel_Density_binned'].apply(lambda x:1 if x>=3 else 0)
X['Pixel_Density_binned4'] = X['Pixel_Density_binned'].apply(lambda x:1 if x>=4 else 0)
X['Pixel_Density_binned5'] = X['Pixel_Density_binned'].apply(lambda x:1 if x>=5 else 0)

X['Resolution_binned1'] = X['Resolution_binned'].apply(lambda x:1 if x>=1 else 0)
X['Resolution_binned2'] = X['Resolution_binned'].apply(lambda x:1 if x>=2 else 0)
X['Resolution_binned3'] = X['Resolution_binned'].apply(lambda x:1 if x>=3 else 0)

X['Screen_Size_binned0'] = X['Screen_Size_binned'].apply(lambda x:1 if x>=0 else 0)
X['Screen_Size_binned1'] = X['Screen_Size_binned'].apply(lambda x:1 if x>=1 else 0)
X['Screen_Size_binned2'] = X['Screen_Size_binned'].apply(lambda x:1 if x>=2 else 0)
X['Screen_Size_binned3'] = X['Screen_Size_binned'].apply(lambda x:1 if x>=3 else 0)
X['Screen_Size_binned4'] = X['Screen_Size_binned'].apply(lambda x:1 if x>=4 else 0)
X['Screen_Size_binned5'] = X['Screen_Size_binned'].apply(lambda x:1 if x>=5 else 0)
X['Screen_Size_binned6'] = X['Screen_Size_binned'].apply(lambda x:1 if x>=6 else 0)

X['Processor_frequency_binned0'] = X['Processor_frequency_binned'].apply(lambda x:1 if x>=0 else 0)
X['Processor_frequency_binned1'] = X['Processor_frequency_binned'].apply(lambda x:1 if x>=1 else 0)
X['Processor_frequency_binned2'] = X['Processor_frequency_binned'].apply(lambda x:1 if x>=2 else 0)
'''

"\nX['SBRC_binned2'] = X['SBRC_binned'].apply(lambda x:1 if x>=2 else 0)\nX['SBRC_binned3'] = X['SBRC_binned'].apply(lambda x:1 if x>=3 else 0)\nX['SBRC_binned4'] = X['SBRC_binned'].apply(lambda x:1 if x>=4 else 0)\nX['SBRC_binned5'] = X['SBRC_binned'].apply(lambda x:1 if x>=5 else 0)\nX['SBRC_binned6'] = X['SBRC_binned'].apply(lambda x:1 if x>=6 else 0)\nX['SBRC_binned7'] = X['SBRC_binned'].apply(lambda x:1 if x>=7 else 0)\nX['SBRC_binned8'] = X['SBRC_binned'].apply(lambda x:1 if x>=8 else 0)\n\n\nX['Capacity_binned1'] = X['Capacity_binned'].apply(lambda x:1 if x>=1 else 0)\nX['Capacity_binned2'] = X['Capacity_binned'].apply(lambda x:1 if x>=2 else 0)\nX['Capacity_binned3'] = X['Capacity_binned'].apply(lambda x:1 if x>=3 else 0)\nX['Capacity_binned4'] = X['Capacity_binned'].apply(lambda x:1 if x>=4 else 0)\nX['Capacity_binned5'] = X['Capacity_binned'].apply(lambda x:1 if x>=5 else 0)\nX['Capacity_binned6'] = X['Capacity_binned'].apply(lambda x:1 if x>=6 else 0)\nX['Capacity_binned7'] 

In [23]:
X.groupby('Screen to Body Ratio (calculated)').mean()

Unnamed: 0_level_0,PhoneId,Screen Size,Internal Memory,Processor_frequency,Resolution,Height,RAM,Capacity,Weight,Pixel Density,...,SIM 2_Other,Num_cores_312,Num_cores_Deca,Num_cores_Dual,Num_cores_Hexa,Num_cores_Octa,Num_cores_Other,Num_cores_Quad,Num_cores_Tru-Octa,Rating
Screen to Body Ratio (calculated),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
23.60,97.0,2.40,4.0,1.10,8.0,134.0,0.0,2000.0,58.0,167.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
28.26,75.0,2.45,4.0,1.10,2.0,133.4,0.0,1500.0,117.0,163.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
29.59,247.0,2.40,32.0,1.79,2.0,117.5,32.0,1200.0,91.0,167.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
30.11,163.5,2.40,32.0,1.79,8.0,118.0,9.5,1020.0,121.0,167.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
30.26,168.0,2.40,0.0,1.79,2.0,115.6,3.0,1200.0,161.0,167.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85.05,76.0,6.40,128.0,2.00,25.0,157.5,8.0,3500.0,182.0,403.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
85.43,104.5,6.41,128.0,2.80,16.0,157.5,7.0,3700.0,185.0,402.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
86.29,73.0,6.40,256.0,2.65,25.0,156.7,8.0,3730.0,186.0,403.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
86.47,42.0,6.59,128.0,2.80,8.0,162.0,8.0,4000.0,199.0,388.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [24]:
test_id = test_new['PhoneId']
H = test_new.drop('PhoneId',axis=1)

H['RAM_normalized'] = (H['RAM'] - H['RAM'].min()) / (H['RAM'].max() - H['RAM'].min())
H['RAM1'] = H['RAM'].apply(lambda x:1 if x>=0 else 0)
H['RAM2'] = H['RAM'].apply(lambda x:1 if x>=0.1 else 0)
H['RAM3'] = H['RAM'].apply(lambda x:1 if x>=0.2 else 0)
H['RAM4'] = H['RAM'].apply(lambda x:1 if x>=0.3 else 0)
H['RAM5'] = H['RAM'].apply(lambda x:1 if x>=0.4 else 0)
H['RAM6'] = H['RAM'].apply(lambda x:1 if x>=0.5 else 0)
H['RAM7'] = H['RAM'].apply(lambda x:1 if x>=0.6 else 0)
H['RAM8'] = H['RAM'].apply(lambda x:1 if x>=0.7 else 0)
H['RAM9'] = H['RAM'].apply(lambda x:1 if x>=0.8 else 0)
H['RAM10'] = H['RAM'].apply(lambda x:1 if x>=0.9 else 0)

#X_train['Internal Memory'].replace(0,0,inplace = True)
#X_train['Internal Memory'].replace(4,0,inplace = True)
#X_train['Internal Memory'].replace(8,3,inplace = True)
#X_train['Internal Memory'].replace(16,7,inplace = True)
#X_train['Internal Memory'].replace(32,7,inplace = True)
#X_train['Internal Memory'].replace(64,8,inplace = True)
#X_train['Internal Memory'].replace(128,9,inplace = True)
#X_train['Internal Memory'].replace(256,10,inplace = True)
#X_train['Internal Memory'].replace(512,10,inplace = True)

H['Internal_Memory_normalized'] = (H['Internal Memory'] - H['Internal Memory'].min()) / (H['Internal Memory'].max() - H['Internal Memory'].min())
H['Internal Memory1'] = H['Internal Memory'].apply(lambda x:1 if x>=0 else 0)
H['Internal Memory2'] = H['Internal Memory'].apply(lambda x:1 if x>=0.1 else 0)
H['Internal Memory3'] = H['Internal Memory'].apply(lambda x:1 if x>=0.2 else 0)
H['Internal Memory4'] = H['Internal Memory'].apply(lambda x:1 if x>=0.3 else 0)
H['Internal Memory5'] = H['Internal Memory'].apply(lambda x:1 if x>=0.4 else 0)
H['Internal Memory6'] = H['Internal Memory'].apply(lambda x:1 if x>=0.5 else 0)
H['Internal Memory7'] = H['Internal Memory'].apply(lambda x:1 if x>=0.6 else 0)
H['Internal Memory8'] = H['Internal Memory'].apply(lambda x:1 if x>=0.7 else 0)
H['Internal Memory9'] = H['Internal Memory'].apply(lambda x:1 if x>=0.8 else 0)
H['Internal Memory10'] = H['Internal Memory'].apply(lambda x:1 if x>=0.9 else 0)

#H['Height_normalized'] = (H['Height'] - H['Height'].min()) / (H['Height'].max() - H['Height'].min())
#H['Height_normalized1'] = H['Height_normalized'].apply(lambda x:1 if x>=0 else 0)
#H['Height_normalized2'] = H['Height_normalized'].apply(lambda x:1 if x>=0.1 else 0)
#H['Height_normalized3'] = H['Height_normalized'].apply(lambda x:1 if x>=0.2 else 0)
#H['Height_normalized4'] = H['Height_normalized'].apply(lambda x:1 if x>=0.3 else 0)
#H['Height_normalized5'] = H['Height_normalized'].apply(lambda x:1 if x>=0.4 else 0)
#H['Height_normalized6'] = H['Height_normalized'].apply(lambda x:1 if x>=0.5 else 0)
#H['Height_normalized7'] = H['Height_normalized'].apply(lambda x:1 if x>=0.6 else 0)
#H['Height_normalized8'] = H['Height_normalized'].apply(lambda x:1 if x>=0.7 else 0)
#H['Height_normalized9'] = H['Height_normalized'].apply(lambda x:1 if x>=0.8 else 0)
#H['Height_normalized10'] = H['Height_normalized'].apply(lambda x:1 if x>=0.9 else 0)

H['Screen to Body Ratio (calculated)_normalized'] = (H['Screen to Body Ratio (calculated)'] - H['Screen to Body Ratio (calculated)'].min()) / (H['Screen to Body Ratio (calculated)'].max() - H['Screen to Body Ratio (calculated)'].min())
H['Screen to Body Ratio (calculated)_normalized1'] = H['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0 else 0)
H['Screen to Body Ratio (calculated)_normalized2'] = H['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0.1 else 0)
H['Screen to Body Ratio (calculated)_normalized3'] = H['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0.2 else 0)
H['Screen to Body Ratio (calculated)_normalized4'] = H['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0.3 else 0)
H['Screen to Body Ratio (calculated)_normalized5'] = H['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0.4 else 0)
H['Screen to Body Ratio (calculated)_normalized6'] = H['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0.5 else 0)
H['Screen to Body Ratio (calculated)_normalized7'] = H['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0.6 else 0)
H['Screen to Body Ratio (calculated)_normalized8'] = H['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0.7 else 0)
H['Screen to Body Ratio (calculated)_normalized9'] = H['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0.8 else 0)
H['Screen to Body Ratio (calculated)_normalized10'] = H['Screen to Body Ratio (calculated)_normalized'].apply(lambda x:1 if x>=0.9 else 0)


#X_train['Screen_Size_normalized'] = (X_train['Screen Size'] - X_train['Screen Size'].min()) / (X_train['Screen Size'].max() - X_train['Screen Size'].min())
#bins = [0,0.99,1.99,2.99,3.99,4.99,5.99,6.99,7.99,8.99,9.99]
#labels = [0,1,2,3,4,5,6,7,8,9]
#X['SBRC_binned'] = pd.cut(X['Screen_Size_normalized'], bins=bins, labels=labels)
#X.head()

#X_train['Screen to Body Ratio (calculated)_normalized'] = (X_train['Screen to Body Ratio (calculated)'] - X_train['Screen to Body Ratio (calculated)'].min()) / (X_train['Screen to Body Ratio (calculated)'].max() - X_train['Screen to Body Ratio (calculated)'].min())

#H['log_Weight']=(H['Weight']+1).transform(np.log)
#H['log_Pixel Density']=(H['Pixel Density']+1).transform(np.log)
#H['log_Screen Size']=(H['Screen Size']+1).transform(np.log)
#H['log_Capacity']=(H['Capacity']+1).transform(np.log)
#H['log_Resolution']=(H['Resolution']+1).transform(np.log)
#H['log_Processor_frequency']=(H['Processor_frequency']+1).transform(np.log)


#HH = H.drop('Weight',axis=1)
#HH = HH.drop('Screen Size',axis=1)
HH = H.drop('RAM',axis=1)
#HH = HH.drop('Height',axis=1)
HH = HH.drop('Internal Memory',axis=1)
#HH = HH.drop('Height_normalized',axis=1)
#HH = HH.drop('Pixel Density',axis=1)
#HH = HH.drop('Capacity',axis=1)
HH = HH.drop('Screen to Body Ratio (calculated)',axis=1)
HH = HH.drop('Screen to Body Ratio (calculated)_normalized',axis=1)
HH = HH.drop('RAM_normalized',axis=1)
HH = HH.drop('Internal_Memory_normalized',axis=1)
#HH = HH.drop('Processor_frequency',axis=1)
#HH = HH.drop('Resolution',axis=1)
H = HH.copy()
G = H.apply(pd.cut,bins = 2,labels =[0,1])
G = G.values
G.shape


(119, 113)

In [25]:
for b in range(X_binarized_train.shape[1] + 1):
  Y_pred_train = []
  accurate_rows = 0

  for x, y in zip(X_binarized_train, Y_train):
    y_pred = (np.sum(x) >= b)
    Y_pred_train.append(y_pred)
    accurate_rows += (y == y_pred)

  print(b, accurate_rows/X_binarized_train.shape[0])

0 0.6979472140762464
1 0.6979472140762464
2 0.6979472140762464
3 0.6979472140762464
4 0.6979472140762464
5 0.6979472140762464
6 0.6979472140762464
7 0.6979472140762464
8 0.6979472140762464
9 0.6979472140762464
10 0.6979472140762464
11 0.6979472140762464
12 0.6979472140762464
13 0.6979472140762464
14 0.6979472140762464
15 0.6979472140762464
16 0.7038123167155426
17 0.7067448680351907
18 0.7067448680351907
19 0.7067448680351907
20 0.7067448680351907
21 0.7126099706744868
22 0.7126099706744868
23 0.7126099706744868
24 0.7155425219941349
25 0.7126099706744868
26 0.6979472140762464
27 0.6979472140762464
28 0.6979472140762464
29 0.6979472140762464
30 0.7096774193548387
31 0.7126099706744868
32 0.6891495601173021
33 0.6803519061583577
34 0.6422287390029325
35 0.5777126099706745
36 0.5278592375366569
37 0.45161290322580644
38 0.3870967741935484
39 0.3020527859237537
40 0.3020527859237537
41 0.3020527859237537
42 0.3020527859237537
43 0.3020527859237537
44 0.3020527859237537
45 0.30205278592375

In [26]:
print(Y_pred_train)

[False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False, False

In [27]:
class MPNeuron:
  
  def __init__(self):
    self.b = None
    
  def model(self, x):
    return(sum(x) >= self.b)
  
  def predict(self, X):
    Y = []
    for x in X:
      result = self.model(x)
      Y.append(result)
    return np.array(Y)
  
  def fit(self, X, Y):
    accuracy = {}
    
    for b in range(X.shape[1] + 1):
      self.b = b
      Y_pred = self.predict(X)
      accuracy[b] = accuracy_score(Y_pred, Y)
      
    best_b = max(accuracy, key = accuracy.get)
    self.b = best_b
    
    print('Optimal value of b is', best_b)
    print('Highest accuracy is', accuracy[best_b])

In [28]:
mp_neuron = MPNeuron()
mp_neuron.fit(X_binarized_train, Y_train)

Optimal value of b is 24
Highest accuracy is 0.7155425219941349


In [29]:
b = 24
print(b)
#print(G)
Y_pred_test = []

for x in G:
  y_pred = (np.sum(x) >= b)
  Y_pred_test.append(y_pred)

dft = pd.DataFrame(Y_pred_test)
d = pd.concat([test_id,dft],axis=1)
d.columns = ['PhoneId','Class']
d['Class'].replace(True,'1',inplace = True)
d['Class'].replace(False,'0',inplace = True)

24


In [30]:
''''bins = [20,29.99,39.99,49.99,59.99,69.99,79.99,90]
labels = [2,3,4,5,6,7,8]
H['SBRC_binned'] = pd.cut(H['Screen to Body Ratio (calculated)'], bins=bins, labels=labels)

bins = [1000,1999,2999,3999,4999,5999,6999,7999,8999,9999,10999,11999,13000]
labels = [1,2,3,4,5,6,7,8,9,10,11,12]
H['Capacity_binned'] = pd.cut(H['Capacity'], bins=bins, labels=labels)

bins = [100,199,299,399,499,599]
labels = [1,2,3,4,5]
H['Pixel_Density_binned'] = pd.cut(H['Pixel Density'], bins=bins, labels=labels)

bins = [0,9,19,29]
labels = [1,2,3]
H['Resolution_binned'] = pd.cut(H['Resolution'], bins=bins, labels=labels)

bins = [0,0.99,1.99,2.99,3.99,4.99,5.99,6.99]
labels = [0,1,2,3,4,5,6]
H['Screen_Size_binned'] = pd.cut(H['Screen Size'], bins=bins, labels=labels)

bins = [0,0.99,1.99,2.99]
labels = [0,1,2]
H['Processor_frequency_binned'] = pd.cut(H['Processor_frequency'], bins=bins, labels=labels)




H['RAM1'] = H['RAM'].apply(lambda x:1 if x>=1 else 0)
H['RAM2'] = H['RAM'].apply(lambda x:1 if x>=2 else 0)
H['RAM3'] = H['RAM'].apply(lambda x:1 if x>=3 else 0)
H['RAM4'] = H['RAM'].apply(lambda x:1 if x>=4 else 0)
H['RAM5'] = H['RAM'].apply(lambda x:1 if x>=5 else 0)
H['RAM6'] = H['RAM'].apply(lambda x:1 if x>=6 else 0)
H['RAM7'] = H['RAM'].apply(lambda x:1 if x>=7 else 0)
H['RAM8'] = H['RAM'].apply(lambda x:1 if x>=8 else 0)
H['RAM9'] = H['RAM'].apply(lambda x:1 if x>=9 else 0)
H['RAM10'] = H['RAM'].apply(lambda x:1 if x>=10 else 0)

H['Internal Memory1'] = H['Internal Memory'].apply(lambda x:1 if x>=1 else 0)
H['Internal Memory2'] = H['Internal Memory'].apply(lambda x:1 if x>=2 else 0)
H['Internal Memory3'] = H['Internal Memory'].apply(lambda x:1 if x>=3 else 0)
H['Internal Memory4'] = H['Internal Memory'].apply(lambda x:1 if x>=4 else 0)
H['Internal Memory5'] = H['Internal Memory'].apply(lambda x:1 if x>=5 else 0)
H['Internal Memory6'] = H['Internal Memory'].apply(lambda x:1 if x>=6 else 0)
H['Internal Memory7'] = H['Internal Memory'].apply(lambda x:1 if x>=7 else 0)
H['Internal Memory8'] = H['Internal Memory'].apply(lambda x:1 if x>=8 else 0)
H['Internal Memory9'] = H['Internal Memory'].apply(lambda x:1 if x>=9 else 0)
H['Internal Memory10'] = H['Internal Memory'].apply(lambda x:1 if x>=10 else 0)

H['SBRC_binned2'] = H['SBRC_binned'].apply(lambda x:1 if x>=2 else 0)
H['SBRC_binned3'] = H['SBRC_binned'].apply(lambda x:1 if x>=3 else 0)
H['SBRC_binned4'] = H['SBRC_binned'].apply(lambda x:1 if x>=4 else 0)
H['SBRC_binned5'] = H['SBRC_binned'].apply(lambda x:1 if x>=5 else 0)
H['SBRC_binned6'] = H['SBRC_binned'].apply(lambda x:1 if x>=6 else 0)
H['SBRC_binned7'] = H['SBRC_binned'].apply(lambda x:1 if x>=7 else 0)
H['SBRC_binned8'] = H['SBRC_binned'].apply(lambda x:1 if x>=8 else 0)

H['Capacity_binned1'] = H['Capacity_binned'].apply(lambda x:1 if x>=1 else 0)
H['Capacity_binned2'] = H['Capacity_binned'].apply(lambda x:1 if x>=2 else 0)
H['Capacity_binned3'] = H['Capacity_binned'].apply(lambda x:1 if x>=3 else 0)
H['Capacity_binned4'] = H['Capacity_binned'].apply(lambda x:1 if x>=4 else 0)
H['Capacity_binned5'] = H['Capacity_binned'].apply(lambda x:1 if x>=5 else 0)
H['Capacity_binned6'] = H['Capacity_binned'].apply(lambda x:1 if x>=6 else 0)
H['Capacity_binned7'] = H['Capacity_binned'].apply(lambda x:1 if x>=7 else 0)
H['Capacity_binned8'] = H['Capacity_binned'].apply(lambda x:1 if x>=8 else 0)
H['Capacity_binned9'] = H['Capacity_binned'].apply(lambda x:1 if x>=9 else 0)
H['Capacity_binned10'] = H['Capacity_binned'].apply(lambda x:1 if x>=10 else 0)
H['Capacity_binned11'] = H['Capacity_binned'].apply(lambda x:1 if x>=11 else 0)
H['Capacity_binned12'] = H['Capacity_binned'].apply(lambda x:1 if x>=12 else 0)

H['Pixel_Density_binned1'] = H['Pixel_Density_binned'].apply(lambda x:1 if x>=1 else 0)
H['Pixel_Density_binned2'] = H['Pixel_Density_binned'].apply(lambda x:1 if x>=2 else 0)
H['Pixel_Density_binned3'] = H['Pixel_Density_binned'].apply(lambda x:1 if x>=3 else 0)
H['Pixel_Density_binned4'] = H['Pixel_Density_binned'].apply(lambda x:1 if x>=4 else 0)
H['Pixel_Density_binned5'] = H['Pixel_Density_binned'].apply(lambda x:1 if x>=5 else 0)

H['Resolution_binned1'] = H['Resolution_binned'].apply(lambda x:1 if x>=1 else 0)
H['Resolution_binned2'] = H['Resolution_binned'].apply(lambda x:1 if x>=2 else 0)
H['Resolution_binned3'] = H['Resolution_binned'].apply(lambda x:1 if x>=3 else 0)

H['Screen_Size_binned0'] = H['Screen_Size_binned'].apply(lambda x:1 if x>=0 else 0)
H['Screen_Size_binned1'] = H['Screen_Size_binned'].apply(lambda x:1 if x>=1 else 0)
H['Screen_Size_binned2'] = H['Screen_Size_binned'].apply(lambda x:1 if x>=2 else 0)
H['Screen_Size_binned3'] = H['Screen_Size_binned'].apply(lambda x:1 if x>=3 else 0)
H['Screen_Size_binned4'] = H['Screen_Size_binned'].apply(lambda x:1 if x>=4 else 0)
H['Screen_Size_binned5'] = H['Screen_Size_binned'].apply(lambda x:1 if x>=5 else 0)
H['Screen_Size_binned6'] = H['Screen_Size_binned'].apply(lambda x:1 if x>=6 else 0)

H['Processor_frequency_binned0'] = H['Processor_frequency_binned'].apply(lambda x:1 if x>=0 else 0)
H['Processor_frequency_binned1'] = H['Processor_frequency_binned'].apply(lambda x:1 if x>=1 else 0)
H['Processor_frequency_binned2'] = H['Processor_frequency_binned'].apply(lambda x:1 if x>=2 else 0)
'''

#H['log_Height']=(H['Height']+1).transform(np.log)
#H['log_Weight']=(H['Weight']+1).transform(np.log)
#H['log_Pixel Density']=(H['Pixel Density']+1).transform(np.log)
#H['log_Processor_frequency']=(H['Processor_frequency']+1).transform(np.log)
#H['log_Screen Size']=(H['Screen Size']+1).transform(np.log)
#H['log_Resolution']=(H['Resolution']+1).transform(np.log)
#H['log_Capacity']=(H['Capacity']+1).transform(np.log)
#H['log_Screen to Body Ratio (calculated)']=(H['Screen to Body Ratio (calculated)']+1).transform(np.log)

#HH = H.drop('Height',axis=1)
#HH = HH.drop('Weight',axis=1)
#HH = HH.drop('Pixel Density',axis=1)
#HH = HH.drop('Processor_frequency',axis=1)
#HH = HH.drop('Screen Size',axis=1)
#HH = HH.drop('Resolution',axis=1)
#HH = HH.drop('Capacity',axis=1)
#HH = HH.drop('Screen to Body Ratio (calculated)',axis=1)

'''
H1 = H.drop('RAM',axis=1)
H2 = H1.drop('Internal Memory',axis=1)
H3 = H2.drop('Screen to Body Ratio (calculated)',axis=1)
H4 = H3.drop('Capacity',axis=1)
H5 = H4.drop('Pixel Density',axis=1)
H6 = H5.drop('Resolution',axis=1)
H7 = H6.drop('Screen Size',axis=1)
H8 = H7.drop('Processor_frequency',axis=1)
H9 = H8.drop('SBRC_binned',axis=1)
H10 = H9.drop('Capacity_binned',axis=1)
H11 = H10.drop('Pixel_Density_binned',axis=1)
H12 = H11.drop('Resolution_binned',axis=1)
H13 = H12.drop('Screen_Size_binned',axis=1)
H14 = H13.drop('Processor_frequency_binned',axis=1)
H = H14.copy()

G = H.apply(pd.cut,bins = 2,labels =[0,1])
G = G.values

'''


"\nH1 = H.drop('RAM',axis=1)\nH2 = H1.drop('Internal Memory',axis=1)\nH3 = H2.drop('Screen to Body Ratio (calculated)',axis=1)\nH4 = H3.drop('Capacity',axis=1)\nH5 = H4.drop('Pixel Density',axis=1)\nH6 = H5.drop('Resolution',axis=1)\nH7 = H6.drop('Screen Size',axis=1)\nH8 = H7.drop('Processor_frequency',axis=1)\nH9 = H8.drop('SBRC_binned',axis=1)\nH10 = H9.drop('Capacity_binned',axis=1)\nH11 = H10.drop('Pixel_Density_binned',axis=1)\nH12 = H11.drop('Resolution_binned',axis=1)\nH13 = H12.drop('Screen_Size_binned',axis=1)\nH14 = H13.drop('Processor_frequency_binned',axis=1)\nH = H14.copy()\n\nG = H.apply(pd.cut,bins = 2,labels =[0,1])\nG = G.values\n\n"

In [31]:
d = d[['PhoneId', 'Class']]
d.groupby('Class').mean()

Unnamed: 0_level_0,PhoneId
Class,Unnamed: 1_level_1
0,304.3
1,234.293578


In [32]:
d.to_csv("submission.csv", index=False)