# Laptop Prices Prediction
This analysis and model will be used to find correlations between laptop specs and price, as well as to predict laptop prices in the future.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

## Variable Identification
First I will explore each variable: I want to find out the data type of each and how many null entries I have in the dataset.

In [None]:
# Load the raw CSV (latin-1 encoded) and use the laptop_ID column as the index.
laptops = pd.read_csv(
    '/kaggle/input/laptop-price/laptop_price.csv',
    encoding='latin-1',
).set_index('laptop_ID')
# Preview the first rows.
laptops.head()

Wow! Not one missing entry! Don't you love it when this happens?

In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
laptops.info()

In [None]:
# Descriptive statistics for the columns pandas treats as numeric at this point.
laptops.describe()

## Univariate Analysis
Now I will visualize some features to try and find some outliers and see if we can find some interesting stats.

In [None]:
# Per-manufacturer row counts (computed here, but not displayed because it is
# not the last expression of the cell).
laptops['Company'].value_counts()

# Wide figure so every manufacturer label fits on the x axis.
fig, ax = plt.subplots(figsize=(20, 6))
sb.countplot(data=laptops, x="Company", ax=ax)

In [None]:
import seaborn as sns

In [None]:
# Average price per manufacturer, most expensive first.
p = laptops.groupby('Company')['Price_euros'].mean().sort_values(ascending=False)
# Plot price on x and manufacturer on y. The variables are passed by keyword
# because seaborn >= 0.12 no longer accepts x/y as positional arguments.
sns.barplot(data=p.reset_index(), x='Price_euros', y='Company')

## Data Cleaning & Prep
Next, I will clean up all of the confusing categorical data

In [None]:
# ---------------------------------------------------------------------------
# Turn the free-text spec columns into numeric features.
# Rewrites Ram, Weight, Memory and ScreenResolution in place and adds
# HDD, SSD, Hybrid, Flash_Storage, X_res, Y_res and PPI.
# ---------------------------------------------------------------------------

# Ram / Weight: strip the unit text ("GB" / "kg") and cast to numbers.
laptops["Ram"] = laptops["Ram"].str.replace("GB", "", regex=False).astype(int)
laptops["Weight"] = laptops["Weight"].str.replace("kg", "", regex=False).astype(float)

# Memory -- HDD, SSD, Hybrid, Flash Storage.
# Normalise every size to plain GB digits: drop ".0", drop "GB", and turn
# "TB" into "000".
memory = laptops["Memory"].astype(str).replace(r"\.0", "", regex=True)
memory = memory.str.replace("GB", "", regex=False).str.replace("TB", "000", regex=False)

# A description may list two drives joined by "+". reindex guarantees that the
# second column exists even if no row has a second drive; those rows get "0".
drives = memory.str.split("+", n=1, expand=True).reindex(columns=[0, 1])
first_drive = drives[0].str.strip()
second_drive = drives[1].fillna("0")

# Size of each drive: keep only the digits. regex=True must be explicit;
# pandas >= 2.0 otherwise treats r"\D" as literal text and int() fails.
first_size = first_drive.str.replace(r"\D", "", regex=True).astype(int)
second_size = second_drive.str.replace(r"\D", "", regex=True).astype(int)

# Output column -> text that identifies that storage type in the description.
storage_labels = {
    "HDD": "HDD",
    "SSD": "SSD",
    "Hybrid": "Hybrid",
    "Flash_Storage": "Flash Storage",
}
for column, label in storage_labels.items():
    # Capacity of this type = size of each drive whose text mentions the label.
    laptops[column] = (
        first_size * first_drive.str.contains(label, regex=False).astype(int)
        + second_size * second_drive.str.contains(label, regex=False).astype(int)
    )

# Memory becomes the total capacity across all recognised storage types.
laptops["Memory"] = laptops[list(storage_labels)].sum(axis=1)

# Screen Resolution
# The text before the first "x" holds the width (possibly preceded by other
# words); the text after it is the height.
resolution = laptops["ScreenResolution"].str.split("x", n=1, expand=True)
laptops["X_res"] = (
    resolution[0]
    .str.replace(",", "", regex=False)
    # Numbers of two or more characters; if several match, average them.
    .str.findall(r"(\d+\.?\d+)")
    .apply(lambda found: np.mean([int(token) for token in found]) if found else np.nan)
    .astype(float)
)
laptops["Y_res"] = pd.to_numeric(resolution[1]).astype(float)

# Pixel density: diagonal length in pixels divided by the Inches column.
laptops["PPI"] = (
    ((laptops["X_res"] ** 2 + laptops["Y_res"] ** 2) ** (1 / 2)) / laptops["Inches"]
).astype(float)
# ScreenResolution is replaced by the total pixel count.
laptops["ScreenResolution"] = (laptops["X_res"] * laptops["Y_res"]).astype(float)

In [None]:
# (rows, columns) of the frame after cleaning.
laptops.shape

In [None]:
# GPU vendor = first space-separated token of the Gpu description.
laptops['Gpu_company'] = [gpu.split(' ')[0] for gpu in laptops['Gpu']]

In [None]:
# Highest price recorded for each GPU vendor, largest first.
(
    laptops.groupby('Gpu_company')['Price_euros']
    .max()
    .sort_values(ascending=False)
)

In [None]:
# Number of priced laptops per GPU vendor, largest first.
(
    laptops.groupby('Gpu_company')['Price_euros']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Same assignment as the earlier Gpu_company cell: the vendor is the first
# space-separated token of Gpu. Re-running it leaves the column unchanged.
laptops['Gpu_company'] = laptops.Gpu.apply(lambda x: x.split(' ')[0])

In [None]:
# How often each distinct Cpu description occurs.
laptops['Cpu'].value_counts()

In [None]:
# Intel "Core i" tier: last character of the third space-separated token of
# Cpu when the text contains "Core i"; 0 for every other CPU.
laptops['core_i'] = (
    laptops['Cpu']
    .map(lambda cpu: cpu.split(' ')[2][-1] if 'Core i' in cpu else 0)
    .astype('int')
)

In [None]:
# Generation digit: first character of the fourth space-separated token of Cpu
# when the text contains "Core i"; 0 for every other CPU.
# NOTE(review): assumes the fourth token is always the model number; verify
# for Cpu strings that omit it, where the token may be something else.
laptops['core_generation'] = (
    laptops['Cpu']
    .map(lambda cpu: cpu.split(' ')[3][0] if 'Core i' in cpu else 0)
    .astype('int')
)

In [None]:
# Distribution of the extracted generation digit (0 = not a "Core i" CPU).
laptops['core_generation'].value_counts()

In [None]:
# Average price for each GPU vendor.
laptops.groupby('Gpu_company')['Price_euros'].mean()

In [None]:
# Clock speed: take the last space-separated token of Cpu, remove the "GHz"
# text, and parse what remains as a float.
last_token = laptops['Cpu'].map(lambda cpu: cpu.rsplit(' ', 1)[-1])
laptops['clock_speed'] = last_token.str.replace('GHz', '').astype('float')


## Bi-variate Analysis
Now I will compare features against each other to try and find some correlation between them.

In [None]:
def correlation_heatmap(train):
    """Draw an annotated heatmap of the pairwise correlations in ``train``.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to analyse. Only its numeric columns are correlated; every
        other column is ignored.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Select the numeric columns explicitly: pandas >= 2.0 raises when corr()
    # meets text columns instead of silently skipping them.
    correlations = train.select_dtypes(include='number').corr()

    fig, ax = plt.subplots(figsize=(16, 16))
    # Draw on the axes created above rather than on whichever is current.
    sb.heatmap(
        correlations,
        vmax=1.0,
        center=0,
        fmt='.2f',
        square=True,
        linewidths=.5,
        annot=True,
        cbar_kws={"shrink": .70},
        ax=ax,
    )
    plt.show()


correlation_heatmap(laptops)

In [None]:
# Scatter of price (x) against RAM (y) on a wide figure.
fig, ax = plt.subplots(figsize=(20, 10))
sb.scatterplot(x="Price_euros", y="Ram", data=laptops, s=75, ax=ax)

In [None]:
# Scatter of price (x) against SSD capacity (y) on a wide figure.
fig, ax = plt.subplots(figsize=(20, 10))
sb.scatterplot(x="Price_euros", y="SSD", data=laptops, s=75, ax=ax)

## Split Current Data
Now I will split the target feature from the dataset and drop the remaining object (non-numeric) features.

In [None]:
# Target: the price column as a NumPy array.
Y = laptops['Price_euros'].values
# Features: every other column except the object-dtype ones.
X = laptops.drop(columns=['Price_euros']).select_dtypes(exclude=['object'])

In [None]:
# Display the feature matrix that will be fed to the models.
X

In [None]:
# Hold out 10% of the rows for the final evaluation; the fixed random_state
# makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=42)

## Train Models
Now I will train a few models and compare them

In [None]:
# Baseline linear model fitted by stochastic gradient descent with default
# hyper-parameters.
SGDreg = SGDRegressor()
# NOTE(review): the features are passed in unscaled; SGD-based estimators are
# usually sensitive to feature scale -- consider standardising them first.
SGDreg.fit(X_train, y_train)

In [None]:
# Root-mean-squared error of the SGD model on the data it was trained on.
pred = SGDreg.predict(X_train)
sgd_mse = mean_squared_error(y_true=y_train, y_pred=pred)
sgd_rmse = np.sqrt(sgd_mse)
sgd_rmse

In [None]:
# Training-set RMSE of the SGD model; this repeats the previous cell exactly.
# NOTE(review): possibly meant to score X_test / y_test instead -- confirm.
pred = SGDreg.predict(X_train)
sgd_mse = mean_squared_error(y_train, pred)
sgd_rmse = np.sqrt(sgd_mse)
sgd_rmse

In [None]:
# Hyper-parameter search for the SGD regressor.
# The squared-error loss was renamed from 'squared_loss' to 'squared_error' in
# scikit-learn 1.0 and the old name was removed in 1.2. Squared error is the
# default loss, so read its name from a default-constructed estimator; that
# keeps the grid valid on both old and new versions.
squared_error_loss = SGDRegressor().loss

param_grid = {
    'alpha': 10.0 ** -np.arange(1, 7),
    'loss': [squared_error_loss, 'huber', 'epsilon_insensitive'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'learning_rate': ['constant', 'optimal', 'invscaling'],
    'max_iter': [1000, 5000, 10000]
}

# Cross-validated search over every combination in the grid.
grid_search = GridSearchCV(SGDreg, param_grid)
grid_search.fit(X_train, y_train)
print("Best score: " + str(grid_search.best_score_))

In [None]:
# Second candidate: k-nearest-neighbours regression with default settings.
Kreg = KNeighborsRegressor()
Kreg.fit(X_train, y_train)

In [None]:
# Root-mean-squared error of the KNN model on the data it was trained on.
pred = Kreg.predict(X_train)
k_mse = mean_squared_error(y_true=y_train, y_pred=pred)
k_rmse = np.sqrt(k_mse)
k_rmse

In [None]:
# Search odd neighbourhood sizes from 1 to 11 with both weighting schemes.
param_grid = {
    'n_neighbors': np.arange(1, 12, 2),
    'weights': ['uniform', 'distance'],
}
grid_search = GridSearchCV(Kreg, param_grid)
grid_search.fit(X_train, y_train)
print("Best score: " + str(grid_search.best_score_))

In [None]:
# Score the held-out rows with the best estimator found by the last grid
# search and print each prediction next to the true price.
final_model = grid_search.best_estimator_
final_pred = final_model.predict(X_test).tolist()
for predicted, actual in zip(final_pred, y_test):
    print("Predicition: " + str(round(predicted, 2)) + " Actual: " + str(actual))

Now let's plot our predicted prices compared to the actual prices.

In [None]:
# Predicted price against measured price for the held-out rows.
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(y_test, final_pred)

# Dashed diagonal from the lowest to the highest measured price: points on
# this line are predictions that equal the measured value.
price_min, price_max = y_test.min(), y_test.max()
ax.plot([price_min, price_max], [price_min, price_max], 'k--', lw=4)

ax.set(xlabel='Measured', ylabel='Predicted')
plt.show()