In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np


In [None]:
# Mount Google Drive at /content/drive so the dataset path used by the
# read_csv cell resolves. NOTE(review): google.colab is only importable inside
# a Colab runtime - confirm before running elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the raw table from the mounted Drive folder into `df`.
df = pd.read_csv('/content/drive/MyDrive/project machine learning/Indian Airlines.csv')

In [None]:
# Peek at the first rows.
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# (rows, columns)
df.shape

In [None]:
# Summary statistics of the numeric columns.
df.describe()

In [None]:
# Missing values per column.
df.isnull().sum()

In [None]:
# Number of fully duplicated rows.
df.duplicated().sum()

In [None]:
df.drop('Unnamed: 0', axis=1, inplace=True)

In [None]:
df['price']=np.log1p(df['price'])

In [None]:
for i in df.columns:
  if df[i].dtype == 'object':
    print(i)
    print(len(df[i].unique()))

In [None]:
df.drop(columns=['flight'],inplace=True)


In [None]:
from sklearn.preprocessing import LabelEncoder


# Map the 'class' labels to integer codes. The fitted encoder is kept in `le`
# because it is pickled at the end of the notebook.
le = LabelEncoder()
df['class'] = le.fit_transform(df['class'])

In [None]:
from sklearn.preprocessing import OneHotEncoder
ohe1 = OneHotEncoder(sparse_output=False)  # drop="first" prevents dummy variable trap
airline_encoded = ohe1.fit_transform(df[["airline"]])
# Convert to DataFrame with column names
airline_df = pd.DataFrame(airline_encoded, columns=ohe1.get_feature_names_out(["airline"]))
# Drop original "Airline" column and merge encoded data
df = df.drop(columns=["airline"]).reset_index(drop=True)
df = pd.concat([df, airline_df], axis=1)


In [None]:
ohe3 = OneHotEncoder(sparse_output=False)
source = ohe3.fit_transform(df[["source_city"]])
source_df = pd.DataFrame(source, columns=ohe3.get_feature_names_out(["source_city"]))
df = df.drop(columns=["source_city"]).reset_index(drop=True)
df = pd.concat([df, source_df], axis=1)

In [None]:
ohe3

In [None]:
ohe4 = OneHotEncoder(sparse_output=False)
dep = ohe4.fit_transform(df[["departure_time"]])
dep_df = pd.DataFrame(dep, columns=ohe4.get_feature_names_out(["departure_time"]))
df = df.drop(columns=["departure_time"]).reset_index(drop=True)
df = pd.concat([df, dep_df], axis=1)

In [None]:
ohe5 = OneHotEncoder(sparse_output=False)
sto = ohe5.fit_transform(df[["stops"]])
sto_df = pd.DataFrame(sto, columns=ohe5.get_feature_names_out(["stops"]))
df = df.drop(columns=["stops"]).reset_index(drop=True)
df = pd.concat([df, sto_df], axis=1)

In [None]:
ohe6 = OneHotEncoder(sparse_output=False)
at = ohe6.fit_transform(df[["arrival_time"]])
at_df = pd.DataFrame(at, columns=ohe6.get_feature_names_out(["arrival_time"]))
df = df.drop(columns=["arrival_time"]).reset_index(drop=True)
df = pd.concat([df, at_df], axis=1)

In [None]:
ohe7 = OneHotEncoder(sparse_output=False)
destination = ohe7.fit_transform(df[["destination_city"]])
destination_df = pd.DataFrame(destination, columns=ohe7.get_feature_names_out(["destination_city"]))
df = df.drop(columns=["destination_city"]).reset_index(drop=True)
df = pd.concat([df, destination_df], axis=1)

In [None]:
# Numeric columns to inspect for outliers.
columns = ['duration', 'days_left', 'price']

# One figure per column, each holding a single horizontal box plot.
for column in columns:
    fig, ax = plt.subplots()
    sns.boxplot(x=df[column], ax=ax)
    ax.set_title(f'Box Plot of {column}')
    plt.show()

In [None]:
columns = ['duration', 'days_left', 'price']

# Report the 25th and 75th percentile of each column of interest.
for column in columns:
    values = df[column]
    Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
    print(f"Column: {column}")
    print(f"Lower Quartile (Q1): {Q1}")
    print(f"Upper Quartile (Q3): {Q3}")
    print("\n")


In [None]:
columns = ['duration', 'days_left', 'price']

# Count, per column, the values lying more than 1.5 * IQR outside the quartiles.
for column in columns:
    values = df[column]
    Q1 = values.quantile(0.25)
    Q3 = values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

    is_outlier = (values < lower_bound) | (values > upper_bound)
    outliers = values[is_outlier]

    print(f"Column: {column}")
    print(f"Number of outliers: {len(outliers)}")
    print("\n")


In [None]:
df_sorted = df.sort_values(by=['price'], ascending=False)
df_sorted.head() # To view the first few rows of the sorted DataFrame


In [None]:
df1 = df.copy() # Call the copy method using parentheses ()
df1[df['price']>100000]

In [None]:
df1 = df1[df['price'] <= 100000]


In [None]:
df1[df['price']>100000]

In [None]:
df1 = df1[df['duration'] > 44]


In [None]:
df1.shape

In [None]:
df1

In [None]:
df1 = df1[df['duration'] < 44]


In [None]:
df

In [None]:
df1.shape


In [None]:
!pip install imbalanced-learn #Install imbalanced-learn, which contains imblearn

In [None]:
df.shape

In [None]:
# Define features (X) and target (y)
X = df.drop(columns=['price'])
y = df['price']

In [None]:
X

In [None]:
df = df.sample(n=10000,replace=False,random_state=42)
df.info()

In [None]:
y

In [None]:
# Pairwise correlation of all columns, drawn as an annotated heat map.
correlation = df.corr()
fig, ax = plt.subplots(figsize=(25, 25))
sns.heatmap(correlation, annot=True, ax=ax)
plt.show()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# NOTE(review): y is the log-transformed price (continuous), so value_counts()
# mostly lists near-unique values - this looks like a leftover from a
# classification workflow; confirm whether it is still wanted.
print(y_train.value_counts())

In [None]:
# First rows of the feature matrix.
X.head()

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# very same scaling to both splits so no test information leaks into the fit.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Untuned linear-regression baseline fitted on the scaled training split.
model = LinearRegression().fit(X_train_scaled, y_train)

# Score the held-out split.
y_pred = model.predict(X_test_scaled)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

In [None]:
from sklearn.metrics import r2_score

# Recompute R-squared from the current y_test / y_pred. This relies on y_pred
# from the baseline linear-regression cell above and repeats that cell's metric.

r2 = r2_score(y_test, y_pred)
print(f"R-squared: {r2}")

Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Requires X_train_scaled and y_train from the scaling / split cells.

# Seeded decision-tree regressor used as the search's base estimator.
dt = DecisionTreeRegressor(random_state=42)

# Discrete candidate values the randomized search samples from.
param_dist_dt = dict(
    criterion=['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
    splitter=['best', 'random'],
    max_depth=[10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10, 20],
    min_samples_leaf=[1, 2, 4, 6],
    max_features=['sqrt', 'log2'],
)

# 5-fold cross-validated random search scored by negative MSE, run on all
# cores and seeded so the sampled candidates are repeatable. fit() returns
# the search object itself, so construction and fitting are chained.
random_search_dt = RandomizedSearchCV(
    estimator=dt,
    param_distributions=param_dist_dt,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42,
).fit(X_train_scaled, y_train)

# Winning configuration and the estimator built with it.
best_model_dt, best_params_dt = (
    random_search_dt.best_estimator_,
    random_search_dt.best_params_,
)

print(f"Best Hyperparameters: {best_params_dt}")

# Now you have the same model as before, stored in 'best_model_dt'

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Requires best_model_dt, X_test_scaled and y_test from earlier cells.
# All metrics are on the scale of y_test (the log1p-transformed price).
y_pred_dt = best_model_dt.predict(X_test_scaled)

mse_dt = mean_squared_error(y_test, y_pred_dt)
mae_dt = mean_absolute_error(y_test, y_pred_dt)

# MAPE: mean absolute error relative to the true value, as a percentage.
relative_error_dt = (y_test - y_pred_dt) / y_test
mape_dt = np.mean(np.abs(relative_error_dt)) * 100

r2_dt = r2_score(y_test, y_pred_dt)

# Print the results
print(f"Decision Tree - Mean Squared Error (MSE): {mse_dt}")
print(f"Decision Tree - Mean Absolute Error (MAE): {mae_dt}")
print(f"Decision Tree - Mean Absolute Percentage Error (MAPE): {mape_dt}")
print(f"Decision Tree - R-squared (R2): {r2_dt}")

Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Requires X_train_scaled and y_train from the scaling / split cells.

# Seeded random-forest regressor used as the search's base estimator.
rf = RandomForestRegressor(random_state=42)

# Search space: integer ranges are sampled from randint distributions, the
# remaining options are picked from explicit lists.
param_dist_rf = dict(
    n_estimators=randint(50, 200),      # number of trees in the forest
    max_depth=[10, 20, 30, 40, 50, None],
    min_samples_split=randint(2, 20),
    min_samples_leaf=randint(1, 10),
    max_features=['sqrt', 'log2', None],
    bootstrap=[True, False],            # whether bootstrap samples are used
)

# 5-fold cross-validated random search scored by negative MSE on all cores;
# fit() returns the search object, so construction and fitting are chained.
random_search_rf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist_rf,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42,
).fit(X_train_scaled, y_train)

# Winning configuration and the estimator built with it.
best_model_rf, best_params_rf = (
    random_search_rf.best_estimator_,
    random_search_rf.best_params_,
)

print(f"Best Hyperparameters for Random Forest: {best_params_rf}")

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Requires best_model_rf, X_test_scaled and y_test from earlier cells.
# All metrics are on the scale of y_test (the log1p-transformed price).
y_pred_rf = best_model_rf.predict(X_test_scaled)

mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

# MAPE: mean absolute error relative to the true value, as a percentage.
relative_error_rf = (y_test - y_pred_rf) / y_test
mape_rf = np.mean(np.abs(relative_error_rf)) * 100

r2_rf = r2_score(y_test, y_pred_rf)

# Print the results
print(f"Random Forest - Mean Squared Error (MSE): {mse_rf}")
print(f"Random Forest - Mean Absolute Error (MAE): {mae_rf}")
print(f"Random Forest - Mean Absolute Percentage Error (MAPE): {mape_rf}")
print(f"Random Forest - R-squared (R2): {r2_rf}")

2. Linear Regression



In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from scipy.stats import uniform
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Requires X_train_scaled, y_train, X_test_scaled and y_test from earlier cells.

# Initialize Linear Regression
linear_model = LinearRegression()

# Linear regression exposes only two boolean switches worth tuning, i.e.
# four candidate configurations in total.
param_dist_linear = {
    'fit_intercept': [True, False],
    'positive': [True, False]  # Constrain coefficients to be non-negative
}

# Four candidates is fewer than RandomizedSearchCV's default n_iter=10, which
# makes it emit a warning and evaluate every combination anyway. GridSearchCV
# performs the same exhaustive search explicitly and without the warning.
# The variable keeps its original name so later cells are unaffected.
random_search_linear = GridSearchCV(
    estimator=linear_model,
    param_grid=param_dist_linear,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# Fit the search to the data
random_search_linear.fit(X_train_scaled, y_train)

# Get the best model and its hyperparameters
# (best_model_linear is the estimator pickled at the end of the notebook)
best_model_linear = random_search_linear.best_estimator_
best_params_linear = random_search_linear.best_params_

print(f"Best Hyperparameters for Linear Regression: {best_params_linear}")

# Make predictions on the test set using the best model
y_pred_linear = best_model_linear.predict(X_test_scaled)

# All metrics below are on the scale of y_test (the log1p-transformed price).

# Calculate MSE
mse_linear = mean_squared_error(y_test, y_pred_linear)

# Calculate MAE
mae_linear = mean_absolute_error(y_test, y_pred_linear)

# Calculate MAPE
mape_linear = np.mean(np.abs((y_test - y_pred_linear) / y_test)) * 100

# Calculate R2
r2_linear = r2_score(y_test, y_pred_linear)

# Print the results
print(f"Linear Regression - Mean Squared Error (MSE): {mse_linear}")
print(f"Linear Regression - Mean Absolute Error (MAE): {mae_linear}")
print(f"Linear Regression - Mean Absolute Percentage Error (MAPE): {mape_linear}")
print(f"Linear Regression - R-squared (R2): {r2_linear}")

3. Lasso Regression



In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Requires X_train_scaled, y_train, X_test_scaled and y_test from earlier cells.

# Seeded Lasso regressor used as the search's base estimator.
lasso_model = Lasso(random_state=42)

# Search space: alpha is drawn from an exponential distribution (scale 1.0);
# the coordinate-update order is picked from the two listed options.
param_dist_lasso = dict(
    alpha=expon(scale=1.0),
    selection=['cyclic', 'random'],
)

# 5-fold cross-validated random search scored by negative MSE on all cores;
# fit() returns the search object, so construction and fitting are chained.
random_search_lasso = RandomizedSearchCV(
    estimator=lasso_model,
    param_distributions=param_dist_lasso,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42,
).fit(X_train_scaled, y_train)

# Winning configuration and the estimator built with it.
best_model_lasso, best_params_lasso = (
    random_search_lasso.best_estimator_,
    random_search_lasso.best_params_,
)

print(f"Best Hyperparameters for Lasso Regression: {best_params_lasso}")

# Held-out evaluation; metrics are on the scale of y_test (log1p price).
y_pred_lasso = best_model_lasso.predict(X_test_scaled)

mse_lasso = mean_squared_error(y_test, y_pred_lasso)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)

# MAPE: mean absolute error relative to the true value, as a percentage.
relative_error_lasso = (y_test - y_pred_lasso) / y_test
mape_lasso = np.mean(np.abs(relative_error_lasso)) * 100

r2_lasso = r2_score(y_test, y_pred_lasso)

# Print the results
print(f"Lasso Regression - Mean Squared Error (MSE): {mse_lasso}")
print(f"Lasso Regression - Mean Absolute Error (MAE): {mae_lasso}")
print(f"Lasso Regression - Mean Absolute Percentage Error (MAPE): {mape_lasso}")
print(f"Lasso Regression - R-squared (R2): {r2_lasso}")

4. Ridge Regression



In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Requires X_train_scaled, y_train, X_test_scaled and y_test from earlier cells.

# Seeded Ridge regressor used as the search's base estimator.
ridge_model = Ridge(random_state=42)

# Search space: alpha is drawn from an exponential distribution (scale 1.0);
# the solver is picked from the listed options.
param_dist_ridge = dict(
    alpha=expon(scale=1.0),
    solver=['svd', 'cholesky', 'lsqr', 'sag', 'saga'],
)

# 5-fold cross-validated random search scored by negative MSE on all cores;
# fit() returns the search object, so construction and fitting are chained.
random_search_ridge = RandomizedSearchCV(
    estimator=ridge_model,
    param_distributions=param_dist_ridge,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42,
).fit(X_train_scaled, y_train)

# Winning configuration and the estimator built with it.
best_model_ridge, best_params_ridge = (
    random_search_ridge.best_estimator_,
    random_search_ridge.best_params_,
)

print(f"Best Hyperparameters for Ridge Regression: {best_params_ridge}")

# Held-out evaluation; metrics are on the scale of y_test (log1p price).
y_pred_ridge = best_model_ridge.predict(X_test_scaled)

mse_ridge = mean_squared_error(y_test, y_pred_ridge)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)

# MAPE: mean absolute error relative to the true value, as a percentage.
relative_error_ridge = (y_test - y_pred_ridge) / y_test
mape_ridge = np.mean(np.abs(relative_error_ridge)) * 100

r2_ridge = r2_score(y_test, y_pred_ridge)

# Print the results
print(f"Ridge Regression - Mean Squared Error (MSE): {mse_ridge}")
print(f"Ridge Regression - Mean Absolute Error (MAE): {mae_ridge}")
print(f"Ridge Regression - Mean Absolute Percentage Error (MAPE): {mape_ridge}")
print(f"Ridge Regression - R-squared (R2): {r2_ridge}")

5. SVM



In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import expon, uniform
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Requires X_train_scaled, y_train, X_test_scaled and y_test from earlier cells.

# Support-vector regressor used as the search's base estimator.
svm_model = SVR()

# Search space: C, epsilon and gamma are drawn from exponential
# distributions; the kernel is picked from the four listed options.
param_dist_svm = dict(
    C=expon(scale=1.0),          # regularization parameter
    epsilon=expon(scale=0.1),    # width of the epsilon-tube
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
    gamma=expon(scale=0.1),      # kernel coefficient
)

# 5-fold cross-validated random search scored by negative MSE on all cores;
# fit() returns the search object, so construction and fitting are chained.
random_search_svm = RandomizedSearchCV(
    estimator=svm_model,
    param_distributions=param_dist_svm,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42,
).fit(X_train_scaled, y_train)

# Winning configuration and the estimator built with it.
best_model_svm, best_params_svm = (
    random_search_svm.best_estimator_,
    random_search_svm.best_params_,
)

print(f"Best Hyperparameters for SVM: {best_params_svm}")

# Held-out evaluation; metrics are on the scale of y_test (log1p price).
y_pred_svm = best_model_svm.predict(X_test_scaled)

mse_svm = mean_squared_error(y_test, y_pred_svm)
mae_svm = mean_absolute_error(y_test, y_pred_svm)

# MAPE: mean absolute error relative to the true value, as a percentage.
relative_error_svm = (y_test - y_pred_svm) / y_test
mape_svm = np.mean(np.abs(relative_error_svm)) * 100

r2_svm = r2_score(y_test, y_pred_svm)

# Print the results
print(f"SVM - Mean Squared Error (MSE): {mse_svm}")
print(f"SVM - Mean Absolute Error (MAE): {mae_svm}")
print(f"SVM - Mean Absolute Percentage Error (MAPE): {mape_svm}")
print(f"SVM - R-squared (R2): {r2_svm}")

6. Gradient Boosting



In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Requires X_train_scaled, y_train, X_test_scaled and y_test from earlier cells.

# Seeded gradient-boosting regressor used as the search's base estimator.
gb_model = GradientBoostingRegressor(random_state=42)

# Search space. scipy's uniform(loc, scale) spans [loc, loc + scale], so
# learning_rate ranges over [0.01, 0.31] and subsample over [0.6, 1.0].
param_dist_gb = dict(
    n_estimators=randint(50, 200),
    learning_rate=uniform(0.01, 0.3),
    max_depth=randint(3, 10),
    min_samples_split=randint(2, 20),
    min_samples_leaf=randint(1, 10),
    subsample=uniform(0.6, 0.4),
    max_features=['sqrt', 'log2', None],
)

# 5-fold cross-validated random search scored by negative MSE on all cores;
# fit() returns the search object, so construction and fitting are chained.
random_search_gb = RandomizedSearchCV(
    estimator=gb_model,
    param_distributions=param_dist_gb,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=42,
).fit(X_train_scaled, y_train)

# Winning configuration and the estimator built with it.
best_model_gb, best_params_gb = (
    random_search_gb.best_estimator_,
    random_search_gb.best_params_,
)

print(f"Best Hyperparameters for Gradient Boosting: {best_params_gb}")

# Held-out evaluation; metrics are on the scale of y_test (log1p price).
y_pred_gb = best_model_gb.predict(X_test_scaled)

mse_gb = mean_squared_error(y_test, y_pred_gb)
mae_gb = mean_absolute_error(y_test, y_pred_gb)

# MAPE: mean absolute error relative to the true value, as a percentage.
relative_error_gb = (y_test - y_pred_gb) / y_test
mape_gb = np.mean(np.abs(relative_error_gb)) * 100

r2_gb = r2_score(y_test, y_pred_gb)

# Print the results
print(f"Gradient Boosting - Mean Squared Error (MSE): {mse_gb}")
print(f"Gradient Boosting - Mean Absolute Error (MAE): {mae_gb}")
print(f"Gradient Boosting - Mean Absolute Percentage Error (MAPE): {mape_gb}")
print(f"Gradient Boosting - R-squared (R2): {r2_gb}")

In [None]:
import pickle

In [None]:
pickle.dump(best_model_linear,open('best_model_linear.sav','wb'))

In [None]:
pickle.dump(le,open('le.sav','wb'))

In [None]:
pickle.dump(ohe1,open('ohe1.sav','wb'))

In [None]:
pickle.dump(ohe3,open('ohe3.sav','wb'))

In [None]:
pickle.dump(ohe4,open('ohe4.sav','wb'))

In [None]:
pickle.dump(ohe5,open('ohe5.sav','wb'))

In [None]:
pickle.dump(ohe6,open('ohe6.sav','wb'))

In [None]:
pickle.dump(ohe7,open('ohe7.sav','wb'))

In [None]:
pickle.dump(scaler,open('scaler.sav','wb'))

In [None]:
X_train.columns