In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
tips=pd.read_csv("C://Users//ngigi//Downloads//tips.csv")

In [3]:
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [4]:
from sklearn.model_selection import train_test_split

#the target variable we want to predict is 'tip'
X = tips.drop(columns=['total_bill'])
y = tips['tip']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Verifying the split
dataset_splits = {
    'X_train_shape': X_train.shape,
    'X_test_shape': X_test.shape,
    'y_train_shape': y_train.shape,
    'y_test_shape': y_test.shape
}
dataset_splits

{'X_train_shape': (170, 6),
 'X_test_shape': (74, 6),
 'y_train_shape': (170,),
 'y_test_shape': (74,)}

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# Define which columns are numerical and categorical
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# We create the preprocessing pipelines for both numeric and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Create the preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the model
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', RandomForestRegressor(random_state=42))
                       ])

# Train the model
model.fit(X_train, y_train)

In [6]:
# Make predictions on the test set
y_pred = model.predict(X_test)

y_pred

array([3.1699, 2.0005, 2.0005, 5.1787, 2.    , 2.0005, 2.5592, 2.5263,
       3.2647, 3.0002, 3.    , 1.4888, 1.4994, 2.0005, 1.7963, 1.2652,
       4.0006, 4.0016, 2.5   , 5.0024, 2.5   , 3.3648, 4.0288, 1.4888,
       3.498 , 2.263 , 1.718 , 5.0385, 2.0005, 5.0007, 4.0016, 1.4896,
       2.264 , 3.2239, 3.4984, 2.9129, 1.502 , 5.0102, 2.769 , 2.    ,
       1.2401, 2.    , 2.5466, 1.6579, 2.    , 1.0263, 1.6274, 2.264 ,
       1.4874, 2.7089, 2.0212, 4.0006, 4.4234, 2.0206, 3.4994, 2.    ,
       5.1557, 1.7491, 2.7495, 3.    , 5.1593, 3.4989, 5.7745, 2.5   ,
       2.0094, 4.0689, 2.0336, 1.0407, 3.4023, 3.4881, 2.0005, 3.    ,
       2.7128, 3.0002])

In [7]:
from sklearn.metrics import confusion_matrix, classification_report

# Since we are dealing with regression, confusion matrix and classification report are not applicable. Instead, we will calculate MSE and R-squared score.
from sklearn.metrics import mean_squared_error, r2_score

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)

# Calculate R-squared score
r2 = r2_score(y_test, y_pred)

mse, r2

(0.0009524358108108251, 0.9992774665286122)

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import KBinsDiscretizer

# Discretize the target variable 'y'
kbins = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
y_binned = kbins.fit_transform(y.values.reshape(-1, 1)).ravel()
y_train_binned = kbins.transform(y_train.values.reshape(-1, 1)).ravel()
y_test_binned = kbins.transform(y_test.values.reshape(-1, 1)).ravel()

# Define the model and GridSearchCV
param_grid = {'knn__n_neighbors': [3, 5, 7, 9], 'knn__weights': ['uniform', 'distance']}
grid_search = GridSearchCV(Pipeline(steps=[('preprocessor', preprocessor), ('knn', KNeighborsClassifier())]), param_grid, cv=5, scoring='accuracy')

# Train and get best params and score
grid_search.fit(X_train, y_train_binned)
best_params, best_score = grid_search.best_params_, grid_search.best_score_
best_params, best_score



({'knn__n_neighbors': 3, 'knn__weights': 'distance'}, 0.9117647058823529)

In [9]:
print(f"Model: K-Nearest Neighbors Classifier (kNN)")
print(f"Best Score: {best_score}")
print(f"Best Parameters: {best_params}")

Model: K-Nearest Neighbors Classifier (kNN)
Best Score: 0.9117647058823529
Best Parameters: {'knn__n_neighbors': 3, 'knn__weights': 'distance'}
