In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

In [2]:
# Load the laptop dataset from a CSV file in the current working directory.
df=pd.read_csv('laptop_data_cleaned.csv')

In [3]:
# Peek at one randomly drawn row; no random_state is given, so the row shown
# differs from run to run.
df.sample()

Unnamed: 0,Company,TypeName,Ram,Weight,Price,TouchScreen,Ips,Ppi,Cpu_brand,HDD,SSD,Gpu_brand,Os
641,HP,Notebook,8,2.6,10.231119,0,0,107.985646,AMD Processor,1000,0,AMD,Windows


In [4]:
# (number of rows, number of columns)
df.shape

(1273, 13)

In [5]:
# Per-column dtype and non-null count, to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1273 entries, 0 to 1272
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Company      1273 non-null   object 
 1   TypeName     1273 non-null   object 
 2   Ram          1273 non-null   int64  
 3   Weight       1273 non-null   float64
 4   Price        1273 non-null   float64
 5   TouchScreen  1273 non-null   int64  
 6   Ips          1273 non-null   int64  
 7   Ppi          1273 non-null   float64
 8   Cpu_brand    1273 non-null   object 
 9   HDD          1273 non-null   int64  
 10  SSD          1273 non-null   int64  
 11  Gpu_brand    1273 non-null   object 
 12  Os           1273 non-null   object 
dtypes: float64(3), int64(5), object(5)
memory usage: 129.4+ KB


In [6]:
# Collapse manufacturers that occur fewer than 8 times into a single 'Other'
# bucket so the later one-hot encoding does not get a column per rare brand.
value = df['Company'].value_counts()
value_repalce = value.index[value.lt(8).to_numpy()]
is_rare_company = df['Company'].isin(value_repalce)
df['Company'] = df['Company'].mask(is_rare_company, 'Other')

In [1]:
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV,cross_val_score
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,BaggingRegressor,ExtraTreesRegressor,GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,LabelEncoder,StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression,ridge_regression,SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor


In [15]:
# Separate the regression target ('Price') from the predictor columns.
y = df['Price']
x = df.drop(columns='Price')


In [18]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical between runs.
X_train,X_test,y_train,y_test=train_test_split(x,y ,test_size=0.2, random_state=42)

In [21]:
# Sanity check: shapes of the train/test features and targets.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((1018, 12), (255, 12), (1018,), (255,))

In [26]:
# Columns that get one-hot encoded by the preprocessing step.
cat_data = [
    'Company',
    'TypeName',
    'Cpu_brand',
    'Gpu_brand',
    'Os',
]

# Columns that get standardised by the preprocessing step.
num_data = [
    'Ram',
    'Weight',
    'TouchScreen',
    'Ips',
    'Ppi',
    'HDD',
    'SSD',
]

In [30]:
# Preprocessing applied to the feature frame:
#   * 'num' -> standardise the numeric columns listed in num_data
#   * 'cat' -> one-hot encode the categorical columns listed in cat_data
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_data),
    # handle_unknown='ignore': a category that was not present when the encoder
    # was fitted is encoded as all zeros at transform time instead of raising
    # (the default, 'error', would abort on any category unseen during fit).
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_data),
])

In [31]:
# Fit the preprocessing on the training split only, then apply the already
# fitted transform to the test split so no test statistics leak into the fit.
X_train_trans=preprocessor.fit_transform(X_train)
X_test_trans=preprocessor.transform(X_test)

In [73]:
# Random forest using the settings that the grid search further down in this
# notebook reports as best (n_estimators=200, max_depth=30, max_features='sqrt').
rfr = RandomForestRegressor(
    n_estimators=200,
    max_depth=30,
    max_features='sqrt',
    n_jobs=-1,
    # Seed the bootstrap sampling and feature sub-sampling so the fitted model,
    # and therefore the reported score, is the same on every run.
    random_state=42,
)
rfr.fit(X_train_trans, y_train)

In [74]:
# Predict the target for the held-out test rows with the fitted forest.
y_pred=rfr.predict(X_test_trans)

In [76]:
# R^2 of the forest's predictions on the test split (displayed as cell output).
r2=r2_score(y_test,y_pred)
r2

0.8887148203443972

In [55]:
# Baseline regressors to compare, all with default hyper-parameters.
# Every estimator that accepts a random_state gets a fixed seed so the
# comparison table is reproducible; LinearRegression and SVR take no seed.
models = [
    RandomForestRegressor(random_state=42),
    AdaBoostRegressor(random_state=42),
    BaggingRegressor(random_state=42),
    ExtraTreesRegressor(random_state=42),
    GradientBoostingRegressor(random_state=42),
    LinearRegression(),
    SGDRegressor(random_state=42),
    SVR(),
    DecisionTreeRegressor(random_state=42),
]

In [60]:
import time

# Fit every candidate in `models` on the transformed training data and record,
# per model: its class name, its R^2 on the test split, and the elapsed time
# for fit + predict.
# NOTE: this cell rebinds `r2` (to a list) and `y_pred`, both of which were
# assigned by earlier cells; after the loop `y_pred` holds the predictions of
# the last model in `models`.
Name = []
r2 = []
Time_Taken = []
for model in models:
    Name.append(type(model).__name__)
    # perf_counter() is the clock intended for measuring short durations:
    # it is monotonic and uses the highest resolution available, whereas
    # time.time() can jump with system clock adjustments.
    begin = time.perf_counter()
    model.fit(X_train_trans, y_train)
    y_pred = model.predict(X_test_trans)
    end = time.perf_counter()
    score = r2_score(y_test, y_pred)
    r2.append(score)
    Time_Taken.append(end - begin)

In [61]:
# Gather the benchmark results into one table with a row per model:
# class name, test-set R^2, and elapsed fit + predict time.
data = dict(model=Name, r2_score=r2, time_=Time_Taken)
all_model = pd.DataFrame(data)

In [62]:
# Show the model comparison table.
all_model

Unnamed: 0,model,r2_score,time_
0,RandomForestRegressor,0.887661,0.429931
1,AdaBoostRegressor,0.813149,0.104989
2,BaggingRegressor,0.865059,0.046471
3,ExtraTreesRegressor,0.868276,0.371701
4,GradientBoostingRegressor,0.866812,0.110322
5,LinearRegression,0.824983,0.043462
6,SGDRegressor,0.754844,0.006008
7,SVR,0.881848,0.073838
8,DecisionTreeRegressor,0.819841,0.004004


In [65]:
# Search space for the random-forest grid search.
# 3 * 4 * 3 * 3 * 5 * 1 = 540 parameter combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    min_samples_leaf=[1, 2, 5],
    max_features=[None, 'log2', 'sqrt'],
    max_leaf_nodes=[None, 2, 5, 10, 20],
    # Single-value entry: not tuned, only passed through to every candidate.
    n_jobs=[-1],
)

In [70]:
# Exhaustive 5-fold cross-validated search over param_grid, starting from the
# rfr estimator; candidates are evaluated in parallel on all available cores
# and progress is logged (verbose=2).
grid_search = GridSearchCV(
    estimator=rfr,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
)

In [71]:
# Run the search on the transformed training data only. This is the expensive
# cell of the notebook: every candidate in param_grid is fitted once per fold.
grid_search.fit(X_train_trans,y_train)

Fitting 5 folds for each of 540 candidates, totalling 2700 fits


In [72]:
# Parameter combination with the best mean cross-validated score.
grid_search.best_params_

{'max_depth': 30,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'min_samples_leaf': 1,
 'n_estimators': 200,
 'n_jobs': -1}