In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

### Load and Explore the Wine Quality Dataset

In [1]:
import pandas as pd
data=pd.read_csv("https://raw.githubusercontent.com/aniruddhachoudhury/Red-Wine-Quality/master/winequality-red.csv")

In [2]:
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
data.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [6]:
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
fixed acidity,1599.0,8.319637,1.741096,4.6,7.1,7.9,9.2,15.9
volatile acidity,1599.0,0.527821,0.17906,0.12,0.39,0.52,0.64,1.58
citric acid,1599.0,0.270976,0.194801,0.0,0.09,0.26,0.42,1.0
residual sugar,1599.0,2.538806,1.409928,0.9,1.9,2.2,2.6,15.5
chlorides,1599.0,0.087467,0.047065,0.012,0.07,0.079,0.09,0.611
free sulfur dioxide,1599.0,15.874922,10.460157,1.0,7.0,14.0,21.0,72.0
total sulfur dioxide,1599.0,46.467792,32.895324,6.0,22.0,38.0,62.0,289.0
density,1599.0,0.996747,0.001887,0.99007,0.9956,0.99675,0.997835,1.00369
pH,1599.0,3.311113,0.154386,2.74,3.21,3.31,3.4,4.01
sulphates,1599.0,0.658149,0.169507,0.33,0.55,0.62,0.73,2.0


In [8]:
data['quality'].value_counts()

quality
5    681
6    638
7    199
4     53
8     18
3     10
Name: count, dtype: int64

####  Split features and target

In [9]:
X = data.drop('quality',axis=1)

In [10]:
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [11]:
y = data['quality']

In [12]:
y.head()

0    5
1    5
2    5
3    6
4    5
Name: quality, dtype: int64

In [14]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42,test_size=0.33)

In [15]:
X_train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
548,12.4,0.35,0.49,2.6,0.079,27.0,69.0,0.9994,3.12,0.75,10.4
355,6.7,0.75,0.01,2.4,0.078,17.0,32.0,0.9955,3.55,0.61,12.8
1296,6.6,0.63,0.0,4.3,0.093,51.0,77.5,0.99558,3.2,0.45,9.5
209,11.0,0.3,0.58,2.1,0.054,7.0,19.0,0.998,3.31,0.88,10.5
140,8.4,0.745,0.11,1.9,0.09,16.0,63.0,0.9965,3.19,0.82,9.6


### Feature scaling

In [18]:
scaler = StandardScaler()

In [19]:
scaler.fit(X_train) # calculate mean & std deviation

In [20]:
print(scaler.mean_)

[ 8.30345472  0.53246499  0.26933707  2.54691877  0.08772736 15.91223156
 46.76330532  0.99677933  3.31453782  0.65881419 10.41521942]


In [22]:
X_train_tf = scaler.transform(X_train)
X_train_tf

array([[ 2.40069523, -1.03103722,  1.12742595, ..., -1.26096312,
         0.52726134, -0.01431863],
       [-0.93967131,  1.22920403, -1.32502245, ...,  1.52622836,
        -0.28225704,  2.24363201],
       [-0.99827424,  0.55113165, -1.37611513, ..., -0.74241587,
        -1.20742091, -0.86105011],
       ...,
       [-0.6466567 ,  0.49462562, -1.06955908, ...,  1.26695473,
        -0.68701624, -0.86105011],
       [-0.23643625, -1.87862768,  0.4121285 , ...,  0.03540501,
         0.81637505,  1.39690052],
       [-1.46709761, -1.3700734 , -0.04770558, ...,  0.48913386,
        -0.68701624,  2.90220094]])

## Support Vector Classifier (SVC) Model

In [25]:
model = SVC()

In [26]:
model.fit(X_train_tf,y_train)

In [27]:
# Accuracy
model.score(X_train_tf,y_train)

0.6778711484593838

In [29]:
X_test_tf = scaler.transform(X_test)

In [33]:

y_predict = model.predict(X_test_tf)

In [34]:
y_predict

array([5, 5, 6, 5, 6, 5, 5, 5, 6, 6, 6, 5, 6, 5, 5, 7, 5, 6, 7, 5, 5, 5,
       6, 6, 5, 5, 6, 5, 5, 6, 5, 5, 6, 5, 6, 5, 6, 6, 5, 6, 5, 5, 6, 5,
       6, 6, 6, 6, 5, 6, 5, 5, 6, 7, 5, 5, 6, 5, 6, 5, 6, 6, 5, 5, 7, 5,
       6, 5, 7, 5, 6, 5, 6, 6, 6, 5, 7, 5, 6, 7, 5, 7, 5, 5, 6, 6, 5, 6,
       6, 5, 6, 5, 5, 6, 5, 6, 5, 6, 5, 5, 5, 5, 6, 6, 6, 6, 6, 5, 6, 5,
       6, 5, 6, 5, 6, 6, 6, 5, 5, 6, 6, 6, 6, 5, 5, 5, 6, 6, 5, 6, 6, 5,
       5, 6, 6, 5, 5, 5, 5, 6, 6, 6, 6, 5, 6, 5, 6, 5, 6, 5, 6, 6, 5, 6,
       6, 6, 5, 6, 5, 6, 7, 6, 6, 5, 5, 6, 5, 5, 5, 5, 5, 5, 6, 5, 7, 6,
       6, 5, 5, 5, 5, 7, 5, 7, 5, 6, 6, 6, 7, 5, 6, 6, 5, 6, 6, 5, 5, 5,
       6, 6, 5, 5, 5, 5, 7, 6, 5, 5, 6, 6, 7, 5, 6, 6, 6, 6, 6, 5, 6, 5,
       5, 6, 6, 6, 5, 5, 5, 7, 5, 5, 5, 5, 6, 6, 5, 6, 5, 6, 6, 5, 5, 5,
       6, 6, 5, 6, 6, 5, 6, 5, 6, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 5, 7,
       6, 7, 6, 5, 6, 6, 5, 6, 5, 5, 5, 5, 6, 6, 6, 5, 7, 5, 5, 5, 5, 6,
       5, 6, 5, 6, 5, 7, 6, 5, 5, 6, 5, 6, 6, 7, 5,

In [31]:
y_test

803     6
124     5
350     6
682     5
1326    6
       ..
813     4
377     7
898     7
126     5
819     5
Name: quality, Length: 528, dtype: int64

In [35]:
accuracy_score(y_predict,y_test)

0.5984848484848485

#### using GridSearchCV

##### Define the Hyperparameter Grid for GridSearchCV

In [43]:
# parameter grid with a range of values to test
param_grid = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1]
}

##### Set Up GridSearchCV

In [46]:
# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,  # Use all available CPU cores
    verbose=1
)

In [47]:
# Fit GridSearchCV
grid_search.fit(X_train_tf, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


##### Evaluate the Best Model

In [48]:
# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Accuracy:", grid_search.best_score_)

Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best Cross-Validation Accuracy: 0.6265333623125408


In [49]:
# Evaluate on test set
best_model = grid_search.best_estimator_
test_accuracy = best_model.score(X_test_tf, y_test)
print("Test Accuracy:", test_accuracy)

Test Accuracy: 0.615530303030303


## Logistic regression

In [38]:
model2 = LogisticRegression()

In [39]:
model2.fit(X_train_tf,y_train)

In [40]:
y_predict2 = model2.predict(X_test_tf)

In [41]:
accuracy_score(y_test,y_predict2)

0.571969696969697

## Support Vector Regression (SVR) for ADMISSION_PREDICTION dataset

#### 1. Load and Explore the Dataset

In [2]:
# 1. Load the dataset
url = "https://raw.githubusercontent.com/srinivasav22/Graduate-Admission-Prediction/master/Admission_Predict_Ver1.1.csv"
data = pd.read_csv(url)
data.info();display(data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Serial No.         500 non-null    int64  
 1   GRE Score          500 non-null    int64  
 2   TOEFL Score        500 non-null    int64  
 3   University Rating  500 non-null    int64  
 4   SOP                500 non-null    float64
 5   LOR                500 non-null    float64
 6   CGPA               500 non-null    float64
 7   Research           500 non-null    int64  
 8   Chance of Admit    500 non-null    float64
dtypes: float64(4), int64(5)
memory usage: 35.3 KB


Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.00,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.80
4,5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
495,496,332,108,5,4.5,4.0,9.02,1,0.87
496,497,337,117,5,5.0,5.0,9.87,1,0.96
497,498,330,120,5,4.5,5.0,9.56,1,0.93
498,499,312,103,4,4.0,5.0,8.43,0,0.73


In [12]:
data.columns = [col.strip() for col in data.columns ] # Stripping the white space from column names

In [13]:
data.columns

Index(['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR', 'CGPA',
       'Research', 'Chance of Admit'],
      dtype='object')

#### 2. Preprocess the data

In [5]:
# Drop Serial No. column
data = data.drop("Serial No.", axis=1)

In [7]:
# Check for missing values
data.isnull().sum()

GRE Score            0
TOEFL Score          0
University Rating    0
SOP                  0
LOR                  0
CGPA                 0
Research             0
Chance of Admit      0
dtype: int64

#### 3. Split the Dataset

In [14]:
X = data.drop('Chance of Admit',axis=1)
y= data['Chance of Admit']

In [15]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42,test_size=0.33)

In [17]:
# Scale features 
# standardize features  using StandardScaler to have zero mean and unit variance.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [19]:
print(scaler.mean_)

[316.7641791  107.31343284   3.09552239   3.38059701   3.50149254
   8.58498507   0.56716418]


#### 4. Train the SVR Model

In [18]:
# Train SVR model
svr = SVR()

In [20]:
svr.fit(X_train_scaled,y_train)

In [22]:
svr.score(X_train_scaled,y_train)

0.7757519357301843

In [25]:
X_test_transform = scaler.transform(X_test)

#### 5.Evaluate the model

In [26]:
y_predict = svr.predict(X_test_transform)

In [28]:
mse = mean_squared_error(y_test,y_predict)
rmse = np.sqrt(mse)
r2 = r2_score(y_test,y_predict)
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

MSE: 0.0049
RMSE: 0.0703
R² Score: 0.7602


#### 6.Hyperparameter tuning

In [31]:
param_grid = {
    'C': [0.1, 1, 10],
    'epsilon': [0.01, 0.1, 0.5],
    'gamma': ['scale', 'auto', 0.1, 0.01],
    'kernel': ['rbf']
}
grid_search = GridSearchCV(SVR(), param_grid, cv=5, scoring='r2')
grid_search.fit(X_train_scaled, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best R² score:", grid_search.best_score_)

Best parameters: {'C': 10, 'epsilon': 0.01, 'gamma': 0.01, 'kernel': 'rbf'}
Best R² score: 0.8000440561770802


#### 7.Predict with best model

In [32]:
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_transform)
print(f"Test R² with best model: {r2_score(y_test, y_pred_best):.4f}")

Test R² with best model: 0.8191
