## K NEAREAST NEIGHBORS REGRESSOR ON 50_STARTUP DATASETS

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

### READ DATASET

In [2]:
import pandas as pd
df = pd.read_csv('50_Startups.csv')
df.head()

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RND     50 non-null     float64
 1   ADMIN   50 non-null     float64
 2   MKT     50 non-null     float64
 3   STATE   50 non-null     object 
 4   PROFIT  50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [4]:
df.isna().sum()

RND       0
ADMIN     0
MKT       0
STATE     0
PROFIT    0
dtype: int64

In [5]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
RND,50.0,73721.6156,45902.256482,0.0,39936.37,73051.08,101602.8,165349.2
ADMIN,50.0,121344.6396,28017.802755,51283.14,103730.875,122699.795,144842.18,182645.56
MKT,50.0,211025.0978,122290.310726,0.0,129300.1325,212716.24,299469.085,471784.1
PROFIT,50.0,112012.6392,40306.180338,14681.4,90138.9025,107978.19,139765.9775,192261.83


### SEPERATE X AND Y FEATURE

In [6]:
X = df.drop(columns=['PROFIT'])
Y = df[['PROFIT']]

In [7]:
X.head()

Unnamed: 0,RND,ADMIN,MKT,STATE
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [8]:
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


In [9]:
cat = list(X.columns[X.dtypes=='object'])
con = list(X.columns[X.dtypes!='object'])

In [10]:
cat

['STATE']

In [11]:
con

['RND', 'ADMIN', 'MKT']

### PREPROCESSING STEPS

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

In [14]:
num_pipe= Pipeline(steps=[('impute',SimpleImputer(strategy='median')),
                          ('scaler',StandardScaler())])
cat_pipe = Pipeline(steps=[('impute',SimpleImputer(strategy='most_frequent')),
                           ('ohe',OneHotEncoder(handle_unknown='ignore'))])
pre=ColumnTransformer([('num',num_pipe,con),
                       ('cat',cat_pipe,cat)])

In [17]:
xpre= pre.fit_transform(X)
xpre[0:5]

array([[ 2.01641149,  0.56075291,  2.15394309,  0.        ,  0.        ,
         1.        ],
       [ 1.95586034,  1.08280658,  1.9236004 ,  1.        ,  0.        ,
         0.        ],
       [ 1.75436374, -0.72825703,  1.62652767,  0.        ,  1.        ,
         0.        ],
       [ 1.55478369, -0.09636463,  1.42221024,  0.        ,  0.        ,
         1.        ],
       [ 1.5049372 , -1.07991935,  1.28152771,  0.        ,  1.        ,
         0.        ]])

In [18]:
cols= pre.get_feature_names_out()
cols

array(['num__RND', 'num__ADMIN', 'num__MKT', 'cat__STATE_California',
       'cat__STATE_Florida', 'cat__STATE_New York'], dtype=object)

In [19]:
x_pre= pd.DataFrame(xpre,columns=cols)
x_pre.head()

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
0,2.016411,0.560753,2.153943,0.0,0.0,1.0
1,1.95586,1.082807,1.9236,1.0,0.0,0.0
2,1.754364,-0.728257,1.626528,0.0,1.0,0.0
3,1.554784,-0.096365,1.42221,0.0,0.0,1.0
4,1.504937,-1.079919,1.281528,0.0,1.0,0.0


### TRAIN TEST SPLIT

In [39]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(x_pre,Y,test_size=0.2,random_state=21)

In [40]:
xtrain.shape

(40, 6)

In [41]:
xtest.shape

(10, 6)

### MODEL CREATION 

In [45]:
from sklearn.neighbors import KNeighborsRegressor
model= KNeighborsRegressor(n_neighbors=5)
model.fit(xtrain,ytrain)

In [46]:
model.score(xtrain,ytrain)

0.8086675332189734

In [47]:
model.score(xtest,ytest)

0.8445531645536952

### HYPERPARAMETER TUNNING FOR THE MODEL

In [97]:
params ={'n_neighbors':[2,3,4,5,6,7,8,9]}

In [98]:
from sklearn.model_selection import GridSearchCV
model1 = KNeighborsRegressor()
gscv= GridSearchCV(model1,param_grid=params,cv=5,scoring='f1_macro')
gscv.fit(xtrain,ytrain)

In [99]:
gscv.best_params_

{'n_neighbors': 2}

In [100]:
gscv.best_score_

nan

In [101]:
best_knn= gscv.best_estimator_
best_knn

In [102]:
best_knn.score(xtrain,ytrain)

0.9106191272125944

In [103]:
best_knn.score(xtest,ytest)

0.8275361964808616

### PREDICTIONS

In [104]:
ypred_tr= best_knn.predict(xtrain)
ypred_ts= best_knn.predict(xtest)

In [105]:
ypred_tr[0:5]

array([[ 98358.255],
       [131762.395],
       [111890.75 ],
       [147009.68 ],
       [117309.575]])

In [106]:
ytrain.head()

Unnamed: 0,PROFIT
30,99937.59
21,111313.02
19,122776.86
11,144259.4
22,110352.25


### PREDICT THE PROFIT

In [107]:
xnew= df.sample(10)
xnew

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
4,142107.34,91391.77,366168.42,Florida,166187.94
29,65605.48,153032.06,107138.38,New York,101004.64
43,15505.73,127382.3,35534.17,New York,69758.98
48,542.05,51743.15,0.0,New York,35673.41
0,165349.2,136897.8,471784.1,New York,192261.83
41,27892.92,84710.77,164470.71,Florida,77798.83
39,38558.51,82982.09,174999.3,California,81005.76
8,120542.52,148718.95,311613.29,New York,152211.77
47,0.0,135426.92,0.0,California,42559.73
30,61994.48,115641.28,91131.24,Florida,99937.59


In [109]:
x_new=xnew.drop(columns=['PROFIT'])
x_new

Unnamed: 0,RND,ADMIN,MKT,STATE
4,142107.34,91391.77,366168.42,Florida
29,65605.48,153032.06,107138.38,New York
43,15505.73,127382.3,35534.17,New York
48,542.05,51743.15,0.0,New York
0,165349.2,136897.8,471784.1,New York
41,27892.92,84710.77,164470.71,Florida
39,38558.51,82982.09,174999.3,California
8,120542.52,148718.95,311613.29,New York
47,0.0,135426.92,0.0,California
30,61994.48,115641.28,91131.24,Florida


In [110]:
x_new_pre=pre.transform(x_new)
x_new_pre

array([[ 1.5049372 , -1.07991935,  1.28152771,  0.        ,  1.        ,
         0.        ],
       [-0.17860854,  1.14245677, -0.85813366,  0.        ,  0.        ,
         1.        ],
       [-1.28113364,  0.21768152, -1.44960468,  0.        ,  0.        ,
         1.        ],
       [-1.61043334, -2.50940884, -1.74312698,  0.        ,  0.        ,
         1.        ],
       [ 2.01641149,  0.56075291,  2.15394309,  0.        ,  0.        ,
         1.        ],
       [-1.00853372, -1.32079581, -0.38455241,  0.        ,  1.        ,
         0.        ],
       [-0.77382036, -1.38312156, -0.29758328,  1.        ,  0.        ,
         0.        ],
       [ 1.03036886,  0.9869521 ,  0.83088691,  0.        ,  0.        ,
         1.        ],
       [-1.62236202,  0.50772188, -1.74312698,  1.        ,  0.        ,
         0.        ],
       [-0.25807437, -0.20562866, -0.99035717,  0.        ,  1.        ,
         0.        ]])

In [111]:
x_new_pre=pd.DataFrame(x_new_pre,columns=cols)
x_new_pre

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
0,1.504937,-1.079919,1.281528,0.0,1.0,0.0
1,-0.178609,1.142457,-0.858134,0.0,0.0,1.0
2,-1.281134,0.217682,-1.449605,0.0,0.0,1.0
3,-1.610433,-2.509409,-1.743127,0.0,0.0,1.0
4,2.016411,0.560753,2.153943,0.0,0.0,1.0
5,-1.008534,-1.320796,-0.384552,0.0,1.0,0.0
6,-0.77382,-1.383122,-0.297583,1.0,0.0,0.0
7,1.030369,0.986952,0.830887,0.0,0.0,1.0
8,-1.622362,0.507722,-1.743127,1.0,0.0,0.0
9,-0.258074,-0.205629,-0.990357,0.0,1.0,0.0


In [112]:
pred = best_knn.predict(x_new_pre)
pred

array([[161589.53 ],
       [111890.75 ],
       [ 39803.74 ],
       [ 58451.235],
       [187581.91 ],
       [ 87288.875],
       [ 76252.125],
       [131762.395],
       [ 56054.62 ],
       [ 98358.255]])

In [113]:
xnew['PREDICTIONS']=pred

In [114]:
xnew

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT,PREDICTIONS
4,142107.34,91391.77,366168.42,Florida,166187.94,161589.53
29,65605.48,153032.06,107138.38,New York,101004.64,111890.75
43,15505.73,127382.3,35534.17,New York,69758.98,39803.74
48,542.05,51743.15,0.0,New York,35673.41,58451.235
0,165349.2,136897.8,471784.1,New York,192261.83,187581.91
41,27892.92,84710.77,164470.71,Florida,77798.83,87288.875
39,38558.51,82982.09,174999.3,California,81005.76,76252.125
8,120542.52,148718.95,311613.29,New York,152211.77,131762.395
47,0.0,135426.92,0.0,California,42559.73,56054.62
30,61994.48,115641.28,91131.24,Florida,99937.59,98358.255


In [115]:
xnew.to_csv('Predictions.csv',index=False)