## Loading and Initializing training data

In [None]:
import pandas as pd

# Attach Google Drive to the Colab filesystem; the dataset CSVs are read from under /content/drive
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Folder on the mounted Drive that holds the Shinkansen CSVs (defined once so the path is not repeated)
DATA_DIR = '/content/drive/MyDrive/Colab Files/shinkansen_datasets'

# Load the survey responses and the travel records for the training set
survey_train = pd.read_csv(f'{DATA_DIR}/Surveydata_train_(1).csv')
travel_train = pd.read_csv(f'{DATA_DIR}/Traveldata_train_(1).csv')

# Keep untouched copies of the original data so later transformations can be compared or undone
survey_train_copy = survey_train.copy()
travel_train_copy = travel_train.copy()

In [None]:
# Preview the first five survey rows to sanity-check the load
survey_train.head(n=5)

Unnamed: 0,ID,Overall_Experience,Seat_Comfort,Seat_Class,Arrival_Time_Convenient,Catering,Platform_Location,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
0,98800001,0,Needs Improvement,Green Car,Excellent,Excellent,Very Convenient,Good,Needs Improvement,Acceptable,Needs Improvement,Needs Improvement,Acceptable,Needs Improvement,Good,Needs Improvement,Poor
1,98800002,0,Poor,Ordinary,Excellent,Poor,Needs Improvement,Good,Poor,Good,Good,Excellent,Needs Improvement,Poor,Needs Improvement,Good,Good
2,98800003,1,Needs Improvement,Green Car,Needs Improvement,Needs Improvement,Needs Improvement,Needs Improvement,Good,Excellent,Excellent,Excellent,Excellent,Excellent,Good,Excellent,Excellent
3,98800004,0,Acceptable,Ordinary,Needs Improvement,,Needs Improvement,Acceptable,Needs Improvement,Acceptable,Acceptable,Acceptable,Acceptable,Acceptable,Good,Acceptable,Acceptable
4,98800005,1,Acceptable,Ordinary,Acceptable,Acceptable,Manageable,Needs Improvement,Good,Excellent,Good,Good,Good,Good,Good,Good,Good


In [None]:
# Left-join the survey responses onto the travel records via the shared 'ID' column,
# keeping every travel row even when no survey row matches
df_train = travel_train.merge(survey_train, how='left', on='ID')

# Show the first rows of the merged frame to confirm the join produced the expected columns
display(df_train.head())

Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Overall_Experience,...,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
0,98800001,Female,Loyal Customer,52.0,,Business,272,0.0,5.0,0,...,Good,Needs Improvement,Acceptable,Needs Improvement,Needs Improvement,Acceptable,Needs Improvement,Good,Needs Improvement,Poor
1,98800002,Male,Loyal Customer,48.0,Personal Travel,Eco,2200,9.0,0.0,0,...,Good,Poor,Good,Good,Excellent,Needs Improvement,Poor,Needs Improvement,Good,Good
2,98800003,Female,Loyal Customer,43.0,Business Travel,Business,1061,77.0,119.0,1,...,Needs Improvement,Good,Excellent,Excellent,Excellent,Excellent,Excellent,Good,Excellent,Excellent
3,98800004,Female,Loyal Customer,44.0,Business Travel,Business,780,13.0,18.0,0,...,Acceptable,Needs Improvement,Acceptable,Acceptable,Acceptable,Acceptable,Acceptable,Good,Acceptable,Acceptable
4,98800005,Female,Loyal Customer,50.0,Business Travel,Business,1981,0.0,0.0,1,...,Needs Improvement,Good,Excellent,Good,Good,Good,Good,Good,Good,Good


## CatBoost Model

In [None]:
!pip install catboost -q

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X = df_train.drop(columns=['Overall_Experience'])
y = df_train['Overall_Experience']

# Identify categorical columns (optional: or provide a predefined list)
cat_cols = X.select_dtypes(include='object').columns.tolist()

# Fill missing values in categorical columns
for col in cat_cols:
    X[col] = X[col].fillna('Unknown')

# Create a validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

#Initialize and train model
model = CatBoostClassifier(
    iterations=10000,
    learning_rate=0.05,
    depth=12,
    l2_leaf_reg= 5,
    random_strength=1.8,
    border_count=254,
    loss_function='Logloss',
    eval_metric='Accuracy',
    od_type='Iter',          # enables early stopping
    od_wait=300,             # stop after 100 rounds of no improvement
    verbose=100
)

# Train CatBoost, passing categorical columns
model.fit(
    X_train, y_train,
    cat_features=cat_cols,
    eval_set=(X_val, y_val),
    use_best_model=True
)


# Evaluate
val_preds = model.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, val_preds))

## Using Model on Test Data for Submission

In [None]:
# Load the test-set CSVs from Google Drive
survey_test = pd.read_csv('/content/drive/MyDrive/Colab Files/shinkansen_datasets/Surveydata_test_(1).csv')
travel_test = pd.read_csv('/content/drive/MyDrive/Colab Files/shinkansen_datasets/Traveldata_test_(1).csv')

# Combine the test datasets the same way as the training data (left join on 'ID')
df_test = pd.merge(travel_test, survey_test, on='ID', how='left')

# The target is not expected in the test data; drop it if it happens to be present
X_test = df_test.drop(columns=['Overall_Experience'], errors='ignore')

# Reuse the categorical column list from training instead of re-deriving it from the test
# dtypes, so both frames are preprocessed identically even if a column parses differently here
cat_cols_test = list(cat_cols)

# Fill missing values in the categorical columns of the test data
X_test = X_test.fillna(dict.fromkeys(cat_cols_test, 'Unknown'))

# Align the columns (set and order) with what the model was fitted on;
# a feature missing from the test data raises a KeyError here instead of mispredicting
X_test = X_test[model.feature_names_]

# Make predictions on the test data
test_preds = model.predict(X_test)

# Display the predictions (optional)
print("Predictions on test data:")
print(test_preds)

Predictions on test data:
[1 1 1 ... 1 1 0]


In [None]:
# Output file name, defined once so the file written and the message printed cannot disagree
submission_path = 'submission3.csv'

# Create a DataFrame with 'ID' and the predicted 'Overall_Experience'
submission_df = pd.DataFrame({'ID': df_test['ID'], 'Overall_Experience': test_preds})

# Save the DataFrame to a CSV file without the index column
submission_df.to_csv(submission_path, index=False)

print(f"Submission file '{submission_path}' created successfully!")


Submission file 'submission.csv' created successfully!
