#### Name : SINU S MARIAM
#### Designation : Machine Learning Intern  
#### Ref : CTI/A1/C41852
#### Organization : COGNIFYZ TECHNOLOGIES
#### Batch Date: 23/06/2024 to 23/09/2024 
#### Task Date : 23/07/2024 to 23/08/2024
#### Second Month Task  - RAILWAY INFORMATION PROCESSING

#### Task 4.2: Model Deployment
 
* Prepare the final model for deployment, ensuring it is optimized and efficient.
* Develop a pipeline for model deployment, including data preprocessing, feature engineering, and prediction.

In [1]:
# Import necessary libraries
import pandas as pd
# from Scikit-learn
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Load the railway dataset from the local CSV export
# (the path is specific to the author's machine)
data_frame = pd.read_csv(
    'C:/Users/sinus/OneDrive/Documents/COGNIFYZ/secondmonth/Railway_info.csv'
)
# Report the dataset dimensions as (rows, columns)
print(data_frame.shape)
# Preview the first five records
data_frame.head()

(11113, 5)


Unnamed: 0,Train_No,Train_Name,Source_Station_Name,Destination_Station_Name,days
0,107,SWV-MAO-VLNK,SAWANTWADI ROAD,MADGOAN JN.,Saturday
1,108,VLNK-MAO-SWV,MADGOAN JN.,SAWANTWADI ROAD,Friday
2,128,MAO-KOP SPEC,MADGOAN JN.,CHHATRAPATI SHAHU MAHARAJ TERMINUS,Friday
3,290,PALACE ON WH,DELHI-SAFDAR JANG,DELHI-SAFDAR JANG,Wednesday
4,401,BSB BHARATDA,AURANGABAD,VARANASI JN.,Saturday


In [3]:
class ReplaceDaysTransformer(BaseEstimator, TransformerMixin):
    """Correct misspelled weekday names in the ``days`` column.

    Values such as ``'Mondayd'`` are mapped back to ``'Monday'``; any value
    that is not a key of ``replacements`` is left unchanged.
    """

    def __init__(self):
        # Misspelled value -> corrected weekday name.
        self.replacements = {
            'Mondayd': 'Monday',
            'Tuesdayd': 'Tuesday',
            'Wednesdayd': 'Wednesday',
            'Thursdayd': 'Thursday',
            'Fridayd': 'Friday',
            'Saturdayd': 'Saturday',
            'Sundayd': 'Sunday'
        }

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``days`` column has the typos fixed.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing a ``days`` column.
        y : ignored
            Accepted for consistency with the other transformers.

        Returns
        -------
        pandas.DataFrame
            A new frame; the caller's frame is not modified.
        """
        # Work on a copy: writing into X itself would silently alter the
        # caller's DataFrame (and warn when X is a slice of another frame).
        X = X.copy()
        X['days'] = X['days'].replace(self.replacements)
        return X

In [4]:
# Define function for cleaning data
import re
def Pre_process_text(act_string):
    """Normalise a text value before it is encoded.

    Every character that is not an ASCII letter or digit is replaced by a
    space, and the result is converted to upper case.

    Parameters
    ----------
    act_string : str
        Raw text value.

    Returns
    -------
    str
        The cleaned, upper-cased text.
    """
    # Replace punctuation/symbols with spaces (letters and digits are kept).
    new_string = re.sub(r'[^a-zA-Z0-9]', " ", act_string)
    # Upper-case the *cleaned* string. The previous code upper-cased
    # ``act_string`` instead, which threw away the substitution above.
    return new_string.upper()

In [5]:
# class for cleaning Text
class MultiColumnTextCleaner(BaseEstimator, TransformerMixin):
    """Apply ``Pre_process_text`` to every value of the configured columns.

    Parameters
    ----------
    columns : iterable of str
        Names of the text columns to clean.
    """

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the configured columns cleaned.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing every column listed in ``columns``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            A new frame; the caller's frame is not modified.
        """
        # Copy first so the caller's DataFrame is left untouched.
        X = X.copy()
        for column in self.columns:
            X[column] = X[column].apply(Pre_process_text)
        return X
# Text columns handed to the text-cleaning step of the preprocessing pipeline
columns_to_clean = [
    'Source_Station_Name',
    'Destination_Station_Name',
    'days',
]

In [6]:
# Define the function to create a weekday or weekend
def weekday_or_not(day):
    """Classify a day name as ``'Weekend'`` or ``'Weekday'``.

    The comparison is case-insensitive: the name is upper-cased before it is
    checked against Saturday and Sunday. Anything else counts as a weekday.
    """
    is_weekend = day.upper() in ('SATURDAY', 'SUNDAY')
    return 'Weekend' if is_weekend else 'Weekday'

In [7]:
#Class for distinguishing weekend or weekday
class WeekendWeekdayTransformer(BaseEstimator, TransformerMixin):
    """Add a column flagging each row as ``'Weekend'`` or ``'Weekday'``.

    Parameters
    ----------
    day_column : str
        Name of the column holding the day names.
    new_column : str
        Name of the column to create with the weekend/weekday label.
    """

    def __init__(self, day_column, new_column):
        self.day_column = day_column
        self.new_column = new_column

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with ``new_column`` added.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing ``day_column``.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            A new frame; the caller's frame does not gain the extra column.
        """
        # Copy first: adding a column to X itself would alter the caller's
        # DataFrame (and warn when X is a slice of another frame).
        X = X.copy()
        X[self.new_column] = X[self.day_column].apply(weekday_or_not)
        return X

In [8]:
# class for label encoding
class MultiColumnLabelEncoder(BaseEstimator, TransformerMixin):
    """Label-encode several DataFrame columns, one ``LabelEncoder`` per column.

    Parameters
    ----------
    columns : list of str, optional
        Columns to encode. ``None`` (the default) encodes every column of the
        frame passed to ``fit``.
    unknown_value : int, optional
        Code assigned at ``transform`` time to labels that were not present
        during ``fit``. With ``None`` (the default) such labels are passed to
        ``LabelEncoder.transform`` exactly as before, which rejects them.
    """

    def __init__(self, columns=None, unknown_value=None):
        self.columns = columns
        self.unknown_value = unknown_value
        # ``columns or []`` keeps the documented ``columns=None`` default from
        # crashing here; the encoders are rebuilt from the data in ``fit``.
        self.encoders = {column: LabelEncoder() for column in (columns or [])}

    def fit(self, X, y=None):
        """Fit one fresh ``LabelEncoder`` per selected column and return ``self``."""
        columns = list(X.columns) if self.columns is None else self.columns
        self.encoders = {column: LabelEncoder().fit(X[column]) for column in columns}
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with the selected columns replaced by integer codes.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing every encoded column.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            A new frame; the caller's frame is not modified.
        """
        # Copy first so the caller's DataFrame keeps its original values.
        X = X.copy()
        for column, encoder in self.encoders.items():
            if self.unknown_value is None:
                # Strict mode: identical to the original behaviour.
                X[column] = encoder.transform(X[column])
            else:
                # Tolerant mode: reproduce LabelEncoder's codes (position in
                # ``classes_``) and give unseen labels the fallback code.
                codes = {label: code for code, label in enumerate(encoder.classes_)}
                X[column] = (
                    X[column].map(codes).fillna(self.unknown_value).astype('int64')
                )
        return X
# Categorical columns handed to the label-encoding step of the preprocessing pipeline
feature_columns = [
    'Train_Name',
    'Source_Station_Name',
    'Destination_Station_Name',
    'days',
    'WEEKEND_OR_WEEKDAY',
]

In [9]:
# Frequency of operation: for each row, the number of rows that share the same
# train number, train name, source station and destination station.
# NOTE(review): every grouping key is also kept as a model input further down,
# so the target looks fully determined by the features - confirm this is intended.
data_frame['Frequency_of_operation'] = (
    data_frame
    .groupby(['Train_No','Train_Name', 'Source_Station_Name', 'Destination_Station_Name'])['Train_No']
    .transform('count')
)

In [10]:
# Separate the input variables (features) from the value to predict (target)
features = data_frame.drop(['Frequency_of_operation'], axis=1)
target = data_frame['Frequency_of_operation']
# Hold out 20% of the rows for testing (train size 0.8).
# The split is seeded so that Restart & Run All reproduces the same train/test
# rows; the seed matches the one given to RandomForestRegressor.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, train_size=.8, random_state=42
)

In [11]:
# Preprocessing pipeline: each step receives the output of the previous one
Pre_pipeline = Pipeline(steps=[
    # 1. repair misspelled day names
    ('replace_days', ReplaceDaysTransformer()),
    # 2. normalise the text columns
    ('text_cleaning', MultiColumnTextCleaner(columns=columns_to_clean)),
    # 3. derive the weekend/weekday flag from the day name
    ('weekend_weekday', WeekendWeekdayTransformer(day_column='days', new_column='WEEKEND_OR_WEEKDAY')),
    # 4. turn the categorical columns into integer codes
    ('label_encoding', MultiColumnLabelEncoder(columns=feature_columns)),
    # 5. min-max scale every column
    ('scaler', MinMaxScaler()),
])

In [12]:
# Learn the preprocessing steps from the training split and apply them to it
X_train_processed = Pre_pipeline.fit_transform(X=X_train)

In [13]:
# Transform X_test
# FIXME(review): fit_transform re-fits every step of Pre_pipeline (including the
# label encoders and MinMaxScaler) on the test split, so the codes and scaling
# applied to X_test are learned from X_test instead of X_train and need not
# match what the model was trained on. The usual call here is
# Pre_pipeline.transform(X_test); presumably that currently fails because
# LabelEncoder rejects labels it did not see during fit - confirm, and make the
# label-encoding step tolerate unseen categories before switching.
X_test_processed = Pre_pipeline.fit_transform(X_test)

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import mean_squared_error
import joblib

In [15]:
# Modelling pipeline: a single random-forest regression step named 'rf'
# (the step name is the prefix used by the hyperparameter grid keys)
pipeline = Pipeline(steps=[
    ('rf', RandomForestRegressor(random_state=42)),
])

In [16]:
# Candidate hyperparameter values; the 'rf__' prefix targets the 'rf' pipeline step
param_grid = {
    'rf__n_estimators': [50, 100, 200],      # number of trees in the forest
    'rf__max_depth': [None, 10, 20, 30],     # maximum depth of each tree
    'rf__min_samples_split': [2, 5, 10],     # samples required to split an internal node
    'rf__min_samples_leaf': [1, 2, 4],       # samples required at a leaf node
}

# 5-fold cross-validated search over the grid, scored by negative MSE
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    scoring='neg_mean_squared_error',
)

# Run the search on the preprocessed training data
grid_search.fit(X_train_processed, y_train)

# Keep the best estimator found by the search
best_pipeline = grid_search.best_estimator_

# Predict on the preprocessed test data
y_pred_RFG = best_pipeline.predict(X_test_processed)

# Test-set error: MSE and its square root
mse = mean_squared_error(y_test, y_pred_RFG)
rmse = mse ** 0.5

print(f"Best Hyperparameters: {grid_search.best_params_}")
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")

# Persist the tuned model pipeline to disk
# (this object holds only the 'rf' step; Pre_pipeline is not part of it)
joblib.dump(best_pipeline, 'best_random_forest_pipeline.pkl')

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best Hyperparameters: {'rf__max_depth': None, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 50}
Mean Squared Error: 0.0
Root Mean Squared Error: 0.0


['best_random_forest_pipeline.pkl']

In [17]:
#### Evaluation

In [18]:
# Coefficient of determination (R squared) of the test-set predictions
r2_score(y_true=y_test, y_pred=y_pred_RFG)

1.0

In [19]:
# accuracy_score is a classification metric and rejects continuous predictions,
# so the regressor's output is rounded to the nearest integer count first.
# This measures the share of rows whose rounded prediction equals the true count.
accuracy_score(y_test, y_pred_RFG.round().astype(int))

1.0

In [20]:
# precision_score is a classification metric: round the regressor's output to
# integer counts, and macro-average so the call also works when the target has
# more than two distinct counts (the default binary averaging would raise).
# zero_division=0 scores a count that is never predicted as 0 instead of warning.
precision_score(y_test, y_pred_RFG.round().astype(int), average='macro', zero_division=0)

1.0

In [21]:
# recall_score is a classification metric: round the regressor's output to
# integer counts, and macro-average so the call also works when the target has
# more than two distinct counts (the default binary averaging would raise).
# zero_division=0 scores a class with no true samples as 0 instead of warning.
recall_score(y_test, y_pred_RFG.round().astype(int), average='macro', zero_division=0)

1.0

-------------------------------THE END----------------------------------