In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error, log_loss
import operator
import json
from IPython import display
import os
import warnings

# Seed NumPy's global random generator so draws made through np.random are repeatable.
np.random.seed(0)
# Silence every warning for the rest of the notebook (library deprecation notices are hidden too).
warnings.filterwarnings("ignore")
# Rating cut-off from the task statement: a phone counts as "liked" when its rating is >= THRESHOLD.
THRESHOLD = 4

**Task: To predict whether the user likes the mobile phone or not.** 
Assumption: If the average rating of a mobile is >= the threshold, the user likes it; otherwise the user does not.




===============================================================================


Missing values:
'Also Known As', 'Bezel-less display'(266), 'Browser'(449), 'Build Material'(338), 'Co-Processor'(451), 'Display Colour'(457), 'Mobile High-Definition Link(MHL)'(472), 'Music'(447), 'Email', 'Fingerprint Sensor Position'(174), 'Games'(446), 'HDMI'(454), 'Heart Rate Monitor'(467), 'IRIS Scanner'(467), 'Optical Image Stabilisation'(219), 'Other Facilities'(444), 'Phone Book'(444), 'Physical Aperture'(87), 'Quick Charging'(122), 'Ring Tone'(444), 'Ruggedness'(430), 'SAR Value'(315), 'SIM 3'(472), 'SMS'(470), 'Screen Protection'(229), 'Screen to Body Ratio (claimed by the brand)'(428), 'Sensor'(242), 'Software Based Aperture'(473), 'Special Features'(459), 'Standby time'(334), 'Stylus'(473), 'TalkTime'(259), 'USB Type-C'(374), 'Video Player'(456), 'Video Recording Features'(458), 'Waterproof'(398), 'Wireless Charging', 'USB OTG Support'(159)

Very low variance:
'Architecture'(most entries are 64-bit),'Audio Jack','GPS','Loudspeaker','Network','Network Support','Other Sensors'(28),'SIM Size', 'VoLTE'

Multivalued:
'Colours','Custom UI','Model'(1),'Other Sensors','Launch Date'

Not important:
'Bluetooth', 'Settings'(75),'Wi-Fi','Wi-Fi Features'

Doubtful:
'Aspect Ratio','Autofocus','Brand','Camera Features','Fingerprint Sensor'(very few entries are missing), 'Fingerprint Sensor Position', 'Graphics'(multivalued),'Image resolution'(multivalued),'SIM Size','Sim Slot(s)', 'User Available Storage', 'SIM 1', 'SIM 2','Shooting Modes', 'Touch Screen'(24), 'USB Connectivity'

To check:
'Display Type','Expandable Memory','FM Radio'

High correlation with other features:
'SIM Slot(s)' is highly correlated with 'SIM 1'; 'Weight' is highly correlated with 'Capacity' and with the screen-to-body ratio; 'Height' overlaps with 'Screen Size', which is already present.

Given a mobile, we can't directly get these features
'Rating Count', 'Review Count'

Keeping:
'Capacity','Flash'(17),'Height'(22),'Internal Memory'(20, require cleaning),'Operating System'(25, require cleaning), 'Pixel Density'(1, clean it),'Processor'(22, clean it), 'RAM'(17, clean), 'Rating','Resolution'(cleaning), 'Screen Resolution','Screen Size', 'Thickness'(22), 'Type','User Replaceable','Weight'(cleaning),'Sim Size'(), 'Other Sensors'(28), 'Screen to Body Ratio (calculated)','Width',

In [None]:
# Load the raw train and test tables from the ../input directory (paths are relative to the notebook).
train = pd.read_csv("../input/train.csv") 
test = pd.read_csv("../input/test.csv")

# Report the row and column counts of train (shape[1] counts every column in the file).
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# Report the row and column counts of test.
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

In [None]:
def data_clean(data):
    """Drop the columns this analysis has decided not to use.

    Four groups are removed: mostly-missing features, features with very low
    variance, multi-valued features, and features judged not important.
    Columns that are not present in ``data`` are ignored, and the surviving
    columns keep their original relative order, so positional indexing done
    later in the notebook is reproducible from run to run.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw phone table (train or test).

    Returns
    -------
    pandas.DataFrame
        A new frame without the listed columns; ``data`` itself is not modified.
    """
    # Features dominated by missing values. Every entry needs its own comma:
    # adjacent string literals are concatenated by Python, which would turn two
    # column names into one non-existent name and leave both columns in place.
    missing_value_columns = [
        'Also Known As', 'Bezel-less display', 'Browser', 'Build Material',
        'Co-Processor', 'Display Colour', 'Mobile High-Definition Link(MHL)',
        'Music', 'Email', 'Fingerprint Sensor Position',
        'Games', 'HDMI', 'Heart Rate Monitor', 'IRIS Scanner',
        'Optical Image Stabilisation', 'Other Facilities',
        'Phone Book', 'Physical Aperture', 'Quick Charging',
        'Ring Tone', 'Ruggedness', 'SAR Value', 'SIM 3', 'SMS',
        'Screen Protection', 'Screen to Body Ratio (claimed by the brand)',
        'Sensor', 'Software Based Aperture', 'Special Features',
        'Standby time', 'Stylus', 'TalkTime', 'USB Type-C',
        'Video Player', 'Video Recording Features', 'Waterproof',
        'Wireless Charging', 'USB OTG Support',
    ]

    # Features having very low variance.
    low_variance_columns = ['Architecture', 'Audio Jack', 'GPS', 'Loudspeaker',
                            'Network', 'Network Support', 'VoLTE']

    # Multi-valued features.
    multivalued_columns = ['Launch Date', 'Custom UI']

    # Features judged not important.
    unimportant_columns = ['Bluetooth', 'Settings', 'Wi-Fi', 'Wi-Fi Features']

    columns_to_remove = (missing_value_columns + low_variance_columns
                         + multivalued_columns + unimportant_columns)

    # errors='ignore' keeps the original tolerance for columns that are absent.
    return data.drop(columns=columns_to_remove, errors='ignore')

**Removing features**


In [None]:
# Apply the first column-pruning pass to both tables (train/test are rebound to the pruned frames).
train = data_clean(train)
test = data_clean(test)

Remove all data points in which more than 15 features are missing.



In [None]:
# Keep only the training rows that have at most 15 missing values across their columns.
train = train[(train.isnull().sum(axis=1) <= 15)]
# The equivalent filter for test is commented out, so every test row is kept.
#test = test[(test.isnull().sum(axis=1) <= 15)]

In [None]:
# Row and column counts of train after column pruning and the missing-value row filter.
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# Row and column counts of test after column pruning (no rows were filtered from test).
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

***Filling Missing values***

In [None]:
 def for_integer(test):
     """Parse the first space-separated token of ``test`` as an ``int``.

     Parameters
     ----------
     test : str
         Raw cell text such as ``"4000 mAh"``; surrounding whitespace is ignored.

     Returns
     -------
     int or None
         The parsed integer, or ``None`` when ``test`` is not a string
         (e.g. a NaN cell) or its first token is not an integer literal.
     """
     try:
         return int(test.strip().split(' ')[0])
     except (AttributeError, TypeError, ValueError):
         # AttributeError/TypeError: the cell is not text (NaN, None, numbers).
         # ValueError: the leading token is not an integer.
         return None

def for_string(test):
    """Return the first space-separated token of ``test``.

    Parameters
    ----------
    test : str
        Raw cell text such as ``"Android v8.1"``; surrounding whitespace is ignored.

    Returns
    -------
    str or None
        The leading token (an empty string for blank text), or ``None`` when
        ``test`` is not a string (e.g. a NaN cell).
    """
    try:
        return test.strip().split(' ')[0]
    except (AttributeError, TypeError):
        # The cell is not text (NaN, None, numbers), so there is nothing to split.
        return None

def for_float(test):
    """Parse the first space-separated token of ``test`` as a ``float``.

    Parameters
    ----------
    test : str
        Raw cell text such as ``"5.5 inches"``; surrounding whitespace is ignored.

    Returns
    -------
    float or None
        The parsed number, or ``None`` when ``test`` is not a string
        (e.g. a NaN cell) or its first token is not a float literal.
    """
    try:
        return float(test.strip().split(' ')[0])
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: the cell is not text (NaN, None, numbers).
        # ValueError: the leading token is not numeric.
        return None
def find_freq(test):
    """Extract the number held in the third space-separated token of ``test``.

    The token may carry one leading ``(`` (e.g. ``"Octa Core (2.2 GHz, ...)"``),
    which is removed before parsing.

    Parameters
    ----------
    test : str
        Raw processor description; surrounding whitespace is ignored.

    Returns
    -------
    float or None
        The parsed value, or ``None`` when ``test`` is not a string, has fewer
        than three tokens, or the third token is not numeric.
    """
    try:
        token = test.strip().split(' ')[2]
        if token.startswith('('):
            token = token[1:]
        return float(token)
    except (AttributeError, TypeError, IndexError, ValueError):
        # Non-text cell, too few tokens, or a non-numeric third token.
        return None

    
def for_Internal_Memory(test):
    """Convert an internal-memory description to a size expressed in GB.

    The first token is the amount and the second is the unit. ``GB`` amounts
    are returned unchanged; ``MB`` amounts are multiplied by 0.001.

    Parameters
    ----------
    test : str
        Raw cell text such as ``"64 GB"`` or ``"512 MB"``.

    Returns
    -------
    int, float or None
        ``int`` for GB values, ``float`` for MB values, and ``None`` when
        ``test`` is not a string, has no unit token, uses any other unit, or
        its amount is not an integer.
    """
    try:
        tokens = test.strip().split(' ')
        unit = tokens[1]
        if unit == 'GB':
            return int(tokens[0])
        if unit == 'MB':
            return int(tokens[0]) * 0.001
    except (AttributeError, TypeError, IndexError, ValueError):
        # Non-text cell, missing unit token, or a non-integer amount.
        return None
    # Any unit other than GB/MB is treated as unparseable.
    return None

In [None]:
def data_clean_2(x):
    """Parse the retained raw text columns into usable features and fill gaps.

    Numeric columns are extracted from their text form with the ``for_*`` /
    ``find_freq`` helpers; most are then filled with a mean or median computed
    from the frame passed in. Categorical gaps are filled with a fixed label.
    Four derived columns are added: ``Num_cores``, ``Processor_frequency``,
    ``os_name`` and ``Sim1``.

    All fills use ``fillna`` / ``.loc`` rather than chained indexing
    (``data[col][mask] = value``), which relies on view semantics and does not
    update the frame when pandas Copy-on-Write is active.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame produced by ``data_clean`` (train or test).

    Returns
    -------
    pandas.DataFrame
        A cleaned copy; ``x`` itself is not modified.
    """
    data = x.copy()

    # Left unfilled: unparseable Capacity values stay missing.
    data['Capacity'] = data['Capacity'].apply(for_integer)

    data['Height'] = data['Height'].apply(for_float)
    data['Height'] = data['Height'].fillna(data['Height'].mean())

    data['Internal Memory'] = data['Internal Memory'].apply(for_Internal_Memory)

    # Left unfilled, like Capacity.
    data['Pixel Density'] = data['Pixel Density'].apply(for_integer)

    data['Internal Memory'] = data['Internal Memory'].fillna(data['Internal Memory'].median())
    # NOTE(review): the int cast truncates MB entries (parsed as fractions of a GB) to 0.
    data['Internal Memory'] = data['Internal Memory'].astype(int)

    data['RAM'] = data['RAM'].apply(for_integer)
    data['RAM'] = data['RAM'].fillna(data['RAM'].median())
    data['RAM'] = data['RAM'].astype(int)

    data['Resolution'] = data['Resolution'].apply(for_integer)
    data['Resolution'] = data['Resolution'].fillna(data['Resolution'].median())
    data['Resolution'] = data['Resolution'].astype(int)

    # Left unfilled.
    data['Screen Size'] = data['Screen Size'].apply(for_float)

    data['Thickness'] = data['Thickness'].apply(for_float)
    data['Thickness'] = data['Thickness'].fillna(data['Thickness'].mean())
    data['Thickness'] = data['Thickness'].round(2)

    data['Type'] = data['Type'].fillna('Li-Polymer')

    ratio_col = 'Screen to Body Ratio (calculated)'
    data[ratio_col] = data[ratio_col].apply(for_float)
    data[ratio_col] = data[ratio_col].fillna(data[ratio_col].mean())
    data[ratio_col] = data[ratio_col].round(2)

    data['Width'] = data['Width'].apply(for_float)
    data['Width'] = data['Width'].fillna(data['Width'].mean())
    data['Width'] = data['Width'].round(2)

    data['Flash'] = data['Flash'].fillna("Other")

    data['User Replaceable'] = data['User Replaceable'].fillna("Other")

    # First word of the processor description (e.g. "Octa", "Quad").
    data['Num_cores'] = data['Processor'].apply(for_string)
    data['Num_cores'] = data['Num_cores'].fillna("Other")

    data['Processor_frequency'] = data['Processor'].apply(find_freq)
    # There is one entry given as 208 (MHz); express it in GHz like the rest.
    data.loc[data['Processor_frequency'] > 200, 'Processor_frequency'] = 0.208
    data['Processor_frequency'] = data['Processor_frequency'].fillna(data['Processor_frequency'].mean())
    data['Processor_frequency'] = data['Processor_frequency'].round(2)

    data['Camera Features'] = data['Camera Features'].fillna("Other")

    # Simplify Operating System to its first word, stored as os_name.
    data['os_name'] = data['Operating System'].apply(for_string)
    data['os_name'] = data['os_name'].fillna("Other")

    # Left unfilled.
    data['Sim1'] = data['SIM 1'].apply(for_string)

    data['SIM Size'] = data['SIM Size'].fillna("Other")

    data['Image Resolution'] = data['Image Resolution'].fillna("Other")

    data['Fingerprint Sensor'] = data['Fingerprint Sensor'].fillna("Other")

    data['Expandable Memory'] = data['Expandable Memory'].fillna("No")

    data['Weight'] = data['Weight'].apply(for_integer)
    data['Weight'] = data['Weight'].fillna(data['Weight'].mean())
    data['Weight'] = data['Weight'].astype(int)

    data['SIM 2'] = data['SIM 2'].apply(for_string)
    data['SIM 2'] = data['SIM 2'].fillna("Other")

    return data

In [None]:
# Parse and fill the retained columns in both tables (train/test are rebound to the cleaned copies).
train = data_clean_2(train)
test = data_clean_2(test)

# Row and column counts of train after cleaning (the derived columns are included).
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# Row and column counts of test after cleaning.
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

**Not very important features**

In [None]:
def data_clean_3(x):
    """Drop the remaining columns that are not used as model inputs.

    Columns that are not present in ``x`` are ignored. The surviving columns
    keep their original relative order, so positional column access later in
    the notebook gives the same result on every run.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame produced by ``data_clean_2`` (train or test).

    Returns
    -------
    pandas.DataFrame
        A new frame without the listed columns; ``x`` itself is not modified.
    """
    columns_to_remove = [
        'User Available Storage', 'SIM Size', 'Chipset', 'Processor', 'Autofocus',
        'Aspect Ratio', 'Touch Screen', 'Bezel-less display', 'Operating System',
        'SIM 1', 'USB Connectivity', 'Other Sensors', 'Graphics', 'FM Radio',
        'NFC', 'Shooting Modes', 'Display Colour',
        'Screen Resolution', 'Camera Features', 'Thickness', 'Display Type',
        'Fingerprint Sensor', 'Flash', 'Rating Count', 'Review Count',
        'Image Resolution', 'Type', 'Expandable Memory', 'Colours', 'Model',
    ]

    # errors='ignore' keeps the original tolerance for columns that are absent.
    return x.drop(columns=columns_to_remove, errors='ignore')


In [None]:
# Drop the remaining unused columns from both tables (train/test are rebound again).
train = data_clean_3(train)
test = data_clean_3(test)

# Row and column counts remaining in train.
print("Number of data points in train: %d" % train.shape[0])
print("Number of features in train: %d" % train.shape[1])

# Row and column counts remaining in test.
print("Number of data points in test: %d" % test.shape[0])
print("Number of features in test: %d" % test.shape[1])

In [None]:
# One-hot encoding.
# Train and test are stacked first so both receive the same dummy columns,
# then split apart again by PhoneId.

train_ids, test_ids = train['PhoneId'], test['PhoneId']

# PhoneId goes first, followed by the other test columns in their existing order.
cols = ['PhoneId'] + [name for name in test.columns if name != 'PhoneId']

# Rating is left out of the stacked frame because test has no such column.
combined = pd.concat([train.drop(columns='Rating')[cols], test[cols]])
print(combined.shape)
print(combined.columns)

combined = pd.get_dummies(combined)
print(combined.shape)
print(combined.columns)

# Recover the two partitions by looking each row's PhoneId up in the saved id series.
train_new = combined.loc[combined['PhoneId'].isin(train_ids)]
test_new = combined.loc[combined['PhoneId'].isin(test_ids)]

In [None]:
# Re-attach the Rating target to the encoded training rows by joining on PhoneId.
train_new = train_new.merge(train[['PhoneId', 'Rating']], on='PhoneId')

In [None]:
# Row and column counts of the encoded training frame (PhoneId and Rating are included).
print("Number of data points in train: %d" % train_new.shape[0])
print("Number of features in train: %d" % train_new.shape[1])

# Row and column counts of the encoded test frame (PhoneId is included).
print("Number of data points in test: %d" % test_new.shape[0])
print("Number of features in test: %d" % test_new.shape[1])

**Check the shape of the test and train data**

In [None]:
# Display the (rows, columns) shape of the encoded training and test frames side by side.
train_new.shape, test_new.shape

**Perform the head operation to get a feel for the test and train data and make basic observations**

In [None]:
train_new.head()
# PhoneId is an identifier column and plays no role in decision making.
# The categorical variables appear as dummy (one-hot) columns; this is an important
# observation and will be used later on in the model.

In [None]:
test_new.head()
# PhoneId is an identifier column and plays no role in decision making.
# There is no Rating column in test_new: it is what has to be predicted by a working model
# built on the training data.

**Create staging DataFrames**


In [None]:
# X_train_new holds every train_new column except the PhoneId identifier and the Rating target.
# The two Y_train_new_* series come from the same train_new rows, so they stay aligned
# row-for-row with X_train_new, which is essential.


X_train_new = train_new.drop(['PhoneId','Rating'],axis=1)
Y_train_new_rating_discrete = train_new['Rating']
# Liked (1) / not liked (0), using the notebook-level THRESHOLD instead of a repeated literal.
Y_train_new_rating_binary   = (train_new['Rating'] >= THRESHOLD).astype(int)

X_test_new = test_new.drop(['PhoneId'],axis=1)

**Run the describe function on X_train_new and X_test_new to check the standard deviation, mean and other statistics, which will help you understand the data better in terms of dummy variables, null values, possible outliers, etc.**

In [None]:
# Summary statistics for the training feature columns.
X_train_new.describe()

In [None]:
# Summary statistics for the test feature columns.
X_test_new.describe()

**Understanding Correlation**


In [None]:
# Correlation is examined only for the first ten columns (by position) of X_train_new.
X_train_corr = X_train_new.iloc[:, list(range(10))]
X_train_corr.head()

In [None]:
# Absolute pairwise correlations, flattened to one row per (feature, feature) pair and sorted ascending.
correlations = X_train_corr.corr().abs().unstack().sort_values(kind="quicksort").reset_index()
# Drop the self-pairs, whose correlation is trivially 1.
correlations = correlations[correlations['level_0'] != correlations['level_1']]

correlations.tail(10)

# The tail of the dataframe holds the feature pairs with the highest correlation,
# whereas the head holds the pairs with the least correlation.

# Observe that Screen Size and Screen to Body Ratio are highly correlated. Because absolute
# values are used, we don't know yet whether the correlation is positive or negative;
# we will find that out visually.
# Similarly, Height and Screen Size are correlated.

# Intuition says they will be positively correlated, meaning any increase in the Height of the
# phone will result in an increase of the Screen Size and vice versa.

# A bigger phone screen (Screen Size) means the phone is lengthier (Height).

**Visual Interpretation of correlation**

In [None]:
# Signed correlation matrix over all training features, drawn as a 20x20 inch heat-map.
corr = X_train_new.corr()
fig = plt.figure()
fig.set_size_inches(20,20)
ax = fig.add_subplot(111)
# Fix the colour scale to the full [-1, 1] correlation range.
cax = ax.matshow(corr,cmap='coolwarm', vmin=-1, vmax=1)
fig.colorbar(cax)
# One tick per feature on both axes, labelled with the column names.
ticks = np.arange(0,len(X_train_new.columns),1)
ax.set_xticks(ticks)
plt.xticks(rotation=90)
ax.set_yticks(ticks)
ax.set_xticklabels(X_train_new.columns)
ax.set_yticklabels(X_train_new.columns)
plt.show()

# We can clearly see that Screen Size is positively correlated with Height and Screen to Body Ratio.
# There are other positively correlated variables too. Another good, easy-to-understand example is
# Brand_Apple and os_name_iOS.

# Look along the 'Brand_Apple' row and compare it with all the columns along the top.
# It is obvious that an Apple iPhone / product user will have an iOS operating
# system on his/her device. He/she cannot have Android on an Apple iPhone.
# This introduces negative correlation. Notice how Brand_Apple and os_name_Android are
# negatively correlated (dark blue), which means Apple phones cannot have the Android OS.

**Plot some interesting graphs**


In [None]:
# Build the frames used for plotting by appending a target column to the training features.
# Two frames are kept for ease of interpretation: one carries the binary target and the other
# carries the original (discrete) rating values; in both, the target column is named 'Rating'.
X_train_visual_rating_binary = pd.concat([X_train_new,Y_train_new_rating_binary],axis=1)
X_train_visual_rating_discrete = pd.concat([X_train_new,Y_train_new_rating_discrete],axis=1)

**RAM vs Capacity - Bar Plot**

The plot indicates that phones with more RAM tend to have larger Capacity values (for the first few bars at least), or vice versa. Perhaps the phones above 16 GB have a good design in which capacity is optimized to a lower value, but the trend remains the same. But did the users like the phone? Note that there is no correlation between RAM and Capacity, so they are good variables/features for deciding whether the user would have liked the phone or not.


In [None]:
# Bar plot of Capacity for each RAM value in the training data.
plt.figure(figsize=(14,8))
sns.barplot(x='RAM',y='Capacity',data=X_train_visual_rating_binary)

**RAM vs Capacity - Scatter Plot**
I was not able to derive any inference from this plot for RAM values of 2 GB to 64 GB because the data points are too tightly stacked. But it is very evident that users of phones with 512 units of RAM disliked the phone.


In [None]:
# Scatter of Capacity against RAM, coloured by the binary Rating column of this frame.
plt.figure(figsize=(14,8))
sns.scatterplot(x='RAM',y='Capacity',hue='Rating',data=X_train_visual_rating_binary)

**Screen Size vs Pixel Density - Bar Plot**
The plot indicates that as Screen Size increases, Pixel Density also seems to increase (at least in the majority of cases), but it is very hard to identify a trend here. In reality, we would not like a phone that has a large screen size and a low pixel density. 

In [None]:
# Bar plot of Pixel Density for each Screen Size value; x labels are rotated to stay readable.
plt.figure(figsize=(14,8))
plt.xticks(rotation='vertical')
sns.barplot(x='Screen Size',y='Pixel Density',data=X_train_visual_rating_binary)


**Resolution vs Pixel Density - Bar Plot**
In reality, a higher resolution needs more pixels per square block. But it seems that Resolution values 7, 13 and 25 have similar Pixel Density. No linear trend is observed. The correlation seems to be < 0.5.

In [None]:
# Bar plot of Pixel Density for each Resolution value; x labels are rotated to stay readable.
plt.figure(figsize=(14,8))
plt.xticks(rotation='vertical')
sns.barplot(x='Resolution',y='Pixel Density',data=X_train_visual_rating_binary)

Resolution vs Pixel Density - Scatter Plot

In [None]:
# Scatter of Pixel Density against Resolution, coloured by the binary Rating column of this frame.
plt.figure(figsize=(14,8))
sns.scatterplot(x='Resolution',y='Pixel Density',hue='Rating',data=X_train_visual_rating_binary)

**Internal Memory vs Weight - Bar Plot**

In [None]:
# Bar plot of Weight for each Internal Memory value; x labels are rotated to stay readable.
plt.figure(figsize=(14,8))
plt.xticks(rotation='vertical')
sns.barplot(x='Internal Memory',y='Weight',data=X_train_visual_rating_binary)


Capacity vs Weight - Bar Plot

In [None]:
# Bar plot of Weight for each Capacity value; x labels are rotated to stay readable.
plt.figure(figsize=(14,8))
plt.xticks(rotation='vertical')
sns.barplot(x='Capacity',y='Weight',data=X_train_visual_rating_binary)

In [None]:
import pandas as pd


In [None]:
# Placeholder submission: every test PhoneId is assigned Class 1 (no model output is used here).
submission = pd.DataFrame({'PhoneId':test_new['PhoneId'], 'Class':[1]*test_new.shape[0]})
# Pin the column order to PhoneId, Class.
submission = submission[['PhoneId', 'Class']]
submission.head()

In [None]:
# Write the submission to submission.csv in the working directory, without the index column.
submission.to_csv("submission.csv", index=False)

In [None]:
# Placeholder cell: it contains only a string literal; no modelling code is present here.
"""modelling code mp neuron and preception"""