## KNN Regression

In [1]:
from warnings import filterwarnings
filterwarnings('ignore')


In [2]:
import pandas as pd
df = pd.read_csv('50_Startups.csv')
df.head()

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RND     50 non-null     float64
 1   ADMIN   50 non-null     float64
 2   MKT     50 non-null     float64
 3   STATE   50 non-null     object 
 4   PROFIT  50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [4]:
df.shape

(50, 5)

In [5]:
df.isna().sum()

RND       0
ADMIN     0
MKT       0
STATE     0
PROFIT    0
dtype: int64

In [6]:
df.duplicated().sum()

0

Separate X and Y

In [7]:
X = df.drop(columns=['PROFIT'])
Y = df[['PROFIT']]

In [8]:
X.head()

Unnamed: 0,RND,ADMIN,MKT,STATE
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [9]:
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


In [10]:
cat = list(X.columns[X.dtypes=='object'])
con = list(X.columns[X.dtypes!='object'])

In [11]:
cat

['STATE']

In [12]:
con

['RND', 'ADMIN', 'MKT']

Preprocessing the data

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [14]:
num_pipe = Pipeline(steps=[('impute', SimpleImputer(strategy='mean')),
                           ('scaler', StandardScaler())])

In [15]:
cat_pipe = Pipeline(steps=[('impute', SimpleImputer(strategy='most_frequent')),
                           ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))])

In [16]:
pre = ColumnTransformer([('num', num_pipe, con),
                         ('cat', cat_pipe, cat)]).set_output(transform='pandas')

In [17]:
X_pre = pre.fit_transform(X)
X_pre.head()

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
0,2.016411,0.560753,2.153943,0.0,0.0,1.0
1,1.95586,1.082807,1.9236,1.0,0.0,0.0
2,1.754364,-0.728257,1.626528,0.0,1.0,0.0
3,1.554784,-0.096365,1.42221,0.0,0.0,1.0
4,1.504937,-1.079919,1.281528,0.0,1.0,0.0


Train Test Split

In [26]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(X_pre, Y, test_size=0.33, random_state=21)

In [27]:
xtrain.shape

(33, 6)

In [28]:
ytrain.shape

(33, 1)

In [29]:
xtest.shape

(17, 6)

In [30]:
ytest.shape

(17, 1)

Create a KNN Regression Model

In [33]:
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor(n_neighbors=5)
model.fit(xtrain, ytrain)

In [34]:
model.score(xtrain, ytrain)

0.7820687508008062

HyperParameter Tuning

In [35]:
params = {'n_neighbors' : [3,4,5,6,7,8,9,10,11]}

In [36]:
from sklearn.model_selection import RandomizedSearchCV
knn = KNeighborsRegressor()
rscv = RandomizedSearchCV(knn, param_distributions=params, cv=5, scoring='neg_mean_squared_error')
rscv.fit(xtrain, ytrain)

In [37]:
rscv.best_params_

{'n_neighbors': 4}

In [38]:
rscv.best_score_

-551818848.1586883

In [39]:
best_knn = rscv.best_estimator_
best_knn

Evaluate Model on Train and Test

In [40]:
best_knn.score(xtrain, ytrain)

0.8101409044403108

In [41]:
best_knn.score(xtest, ytest)

0.8331171941280375

Predcit the Results from Train and Test

In [42]:
ypred_tr = best_knn.predict(xtrain)
ypred_ts = best_knn.predict(xtest)

In [43]:
ypred_tr[0:5]

array([[132383.5675],
       [101683.34  ],
       [ 65461.91  ],
       [ 61356.39  ],
       [ 69509.99  ]])

In [44]:
ytrain.head()

Unnamed: 0,PROFIT
13,134307.35
28,103282.38
49,14681.4
40,78239.91
45,64926.08


In [45]:
ypred_ts[0:5]

array([[141046.535 ],
       [ 71765.4875],
       [ 69509.99  ],
       [101671.975 ],
       [130949.4325]])

In [46]:
ytest.head()

Unnamed: 0,PROFIT
7,155752.6
44,65200.33
43,69758.98
25,107404.34
14,132602.65


Plot the Confusion Matrix

In [51]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [52]:
mse = mean_squared_error(ytest, ypred_ts)
mse

250321112.53392038

In [53]:
mae = mean_absolute_error(ytest, ypred_ts)
mae

12931.429264705885

In [54]:
r2 = r2_score(ytest, ypred_ts)
r2

0.8331171941280375

In [55]:
def adj_r2(xtrain, ytrain, model):
    r2 = model.score(xtrain, ytrain)
    N = xtrain.shape[0]
    p = xtrain.shape[1]
    num = (1-r2)*(N-1)
    den = N-p-1
    r2a = 1-(num/den)
    return r2a

In [56]:
adj_r2(xtrain, ytrain, best_knn)

0.7663272670034595