In [2]:
#uploading train.csv
import pandas as pd
from google.colab import files

uploaded = files.upload()

Saving train.csv to train.csv


In [3]:
#uploading train.csv
import pandas as pd
from google.colab import files

uploaded = files.upload()

Saving test.csv to test.csv


In [4]:
train_data = pd.read_csv("train.csv")  # Use the exact filename as uploaded
test_data = pd.read_csv("test.csv")    # Use the exact filename as uploaded


In [5]:
# Define the missing value threshold
missing_threshold = 0.6

# Step 1: Drop columns with excessive missing values in train_data
cols_to_keep = train_data.columns[train_data.isnull().mean() <= missing_threshold]
train_data = train_data[cols_to_keep]
test_data = test_data[cols_to_keep.intersection(test_data.columns)]

# Step 2: Impute missing numerical values with the median of train_data
numerical_cols = train_data.select_dtypes(include=['float64', 'int64']).columns
for col in numerical_cols:
    median_value = train_data[col].median()  # Compute median from train_data
    train_data.loc[:, col] = train_data[col].fillna(median_value)  # Impute train_data
    if col in test_data.columns:
        test_data.loc[:, col] = test_data[col].fillna(median_value)  # Impute test_data

# Step 3: Impute missing categorical values with the mode of train_data
categorical_cols = train_data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    mode_value = train_data[col].mode()[0]  # Compute mode from train_data
    train_data.loc[:, col] = train_data[col].fillna(mode_value)  # Impute train_data
    if col in test_data.columns:
        test_data.loc[:, col] = test_data[col].fillna(mode_value)  # Impute test_data

# Output the results
print("Train Data Shape After Handling Missing Values:", train_data.shape)
print("Test Data Shape After Handling Missing Values:", test_data.shape)


Train Data Shape After Handling Missing Values: (12654, 24)
Test Data Shape After Handling Missing Values: (5852, 23)


In [6]:
from sklearn.preprocessing import StandardScaler

# Identify numerical columns present in both train and test datasets
common_numerical_cols = list(set(numerical_cols).intersection(set(test_data.columns)))

# Scale numerical features
scaler = StandardScaler()
train_data[common_numerical_cols] = scaler.fit_transform(train_data[common_numerical_cols])
test_data[common_numerical_cols] = scaler.transform(test_data[common_numerical_cols])


In [7]:
# Function to remove outliers using IQR
def remove_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Remove rows where values are outside the IQR bounds
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df

# Remove outliers in train_data for numerical columns
for col in numerical_cols:
    train_data = remove_outliers_iqr(train_data, col)

# Output the results
print("Outliers in train_data have been removed.")
print("Train Data Shape After Outlier Removal:", train_data.shape)


Outliers in train_data have been removed.
Train Data Shape After Outlier Removal: (7250, 24)


In [9]:
# Feature Engineering
import pandas as pd
import numpy as np
# 1. Create 'booked_package' column
train_data['booked_package'] = train_data[['package_accomodation', 'food_package', 'transport_package_mx',
                                           'sightseeing_package', 'guided_tour_package',
                                           'tour_arrangement']].any(axis=1)
test_data['booked_package'] = test_data[['package_accomodation', 'food_package', 'transport_package_mx',
                                         'sightseeing_package', 'guided_tour_package',
                                         'tour_arrangement']].any(axis=1)

# 2. Extract average age from 'age_bracket'
def extract_average_age(age_range):
    if '-' in age_range:
        start, end = age_range.split('-')
        return (int(start) + int(end)) // 2
    elif '<' in age_range:
        return 1  # Replace '<1' with 1
    else:
        try:
            return int(age_range)
        except ValueError:
            return np.nan

train_data['age_bracket'] = train_data['age_bracket'].apply(extract_average_age)
test_data['age_bracket'] = test_data['age_bracket'].apply(extract_average_age)

# 3. Create 'island_to_mainland_ratio' column
train_data['island_to_mainland_ratio'] = train_data['island_nights'] / (train_data['mainland_nights'] + 1e-5)
test_data['island_to_mainland_ratio'] = test_data['island_nights'] / (test_data['mainland_nights'] + 1e-5)

# 4. Create 'female_to_male_ratio' column
train_data['female_to_male_ratio'] = train_data['female_count'] / (train_data['male_count'] + 1e-5)
test_data['female_to_male_ratio'] = test_data['female_count'] / (test_data['male_count'] + 1e-5)

#5. Convert 'days_before_booked' to numeric
def days_to_numeric(days):
    if pd.isna(days):
        return np.nan
    elif '+' in days:
        return int(days.replace('+', ''))
    elif '-' in days:
        start, end = days.split('-')
        return (int(start) + int(end)) // 2
    else:
        return int(days)

train_data['days_before_booked_numeric'] = train_data['days_before_booked'].apply(days_to_numeric)
test_data['days_before_booked_numeric'] = test_data['days_before_booked'].apply(days_to_numeric)

def categorize_tour_length(length):
    if '7-14' in length:
        return 'Short'
    elif '30+' in length:
        return 'Long'
    else:
        return 'Medium'

train_data['tour_length_category'] = train_data['tour_length'].apply(categorize_tour_length)
test_data['tour_length_category'] = test_data['tour_length'].apply(categorize_tour_length)

#Verify the engineered features
print("Feature Engineering Completed.")
print("Train Data with New Features:", train_data.head())
print("Test Data with New Features:", test_data.head())

# Columns used to create 'booked_package'
package_columns = ['package_accomodation', 'food_package', 'transport_package_mx',
                   'sightseeing_package', 'guided_tour_package', 'tour_arrangement']

# Drop the redundant columns from both train_data and test_data
train_data = train_data.drop(columns=package_columns, errors='ignore')
test_data = test_data.drop(columns=package_columns, errors='ignore')

# Verify the remaining columns
print("Redundant package columns dropped.")
print("Train Data Columns After Dropping:", train_data.columns)
print("Test Data Columns After Dropping:", test_data.columns)


Feature Engineering Completed.
Train Data with New Features:            trip_ID visitor_nation  age_bracket travelling_with  female_count  \
3  tour_idkoh8mkgr          ITALY         34.0     With Spouse      0.044374   
4  tour_idkmsfa00a          ITALY         34.0     With Spouse      0.044374   
6  tour_iddge8fz8p          INDIA         34.0     With Spouse      0.044374   
7  tour_ida5537syq          KENYA         54.0           Alone     -0.744259   
8  tour_idhagcpzkz        AUSTRIA         21.0     With Spouse      0.044374   

   male_count        key_activity             trip_purpose first_time_visitor  \
3   -0.007617     Widlife Tourism     Leisure and Holidays                Yes   
4   -0.007617       Beach Tourism     Leisure and Holidays                Yes   
6   -0.007617     Hunting Tourism                 Business                 No   
7   -0.007617  Conference Tourism  Meetings and Conference                 No   
8   -0.007617     Widlife Tourism     Leisure and Hol

In [10]:
# Drop the specified columns from both train_data and test_data
columns_to_drop = ['female_count', 'male_count', 'island_nights', 'mainland_nights']

train_data.drop(columns=columns_to_drop, inplace=True)
test_data.drop(columns=columns_to_drop, inplace=True)

# Check the first few rows of train_data and test_data to confirm the columns are dropped
print(train_data.head())
print(test_data.head())


           trip_ID visitor_nation  age_bracket travelling_with  \
3  tour_idkoh8mkgr          ITALY         34.0     With Spouse   
4  tour_idkmsfa00a          ITALY         34.0     With Spouse   
6  tour_iddge8fz8p          INDIA         34.0     With Spouse   
7  tour_ida5537syq          KENYA         54.0           Alone   
8  tour_idhagcpzkz        AUSTRIA         21.0     With Spouse   

         key_activity             trip_purpose first_time_visitor  \
3     Widlife Tourism     Leisure and Holidays                Yes   
4       Beach Tourism     Leisure and Holidays                Yes   
6     Hunting Tourism                 Business                 No   
7  Conference Tourism  Meetings and Conference                 No   
8     Widlife Tourism     Leisure and Holidays                Yes   

  transport_package_international               source_of_info  \
3                              No  Travel agent, tour operator   
4                             Yes  Travel agent, tour op

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Create an imputer to fill missing values with the median
imputer = SimpleImputer(strategy='median')

# Impute missing values in the 'age_bracket' column
train_data['age_bracket'] = imputer.fit_transform(train_data[['age_bracket']])
test_data['age_bracket'] = imputer.fit_transform(test_data[['age_bracket']])
# Define the scaler
scaler = StandardScaler()

# Scale only the 'age_bracket' column in train_data and test_data
train_data['age_bracket'] = scaler.fit_transform(train_data[['age_bracket']])
test_data['age_bracket'] = scaler.fit_transform(test_data[['age_bracket']])

# Check the first few rows of train_data and test_data to confirm scaling
print(train_data[['age_bracket']].head())
print(test_data[['age_bracket']].head())


   age_bracket
3    -0.486059
4    -0.486059
6    -0.486059
7     1.350659
8    -1.679926
   age_bracket
0    -0.360737
1     1.392105
2     1.392105
3    -0.360737
4    -0.360737


In [12]:
# Drop 'days_before_booked' column from train_data and test_data
train_data = train_data.drop(columns=['days_before_booked'], errors='ignore')
test_data = test_data.drop(columns=['days_before_booked'], errors='ignore')

# Verify the remaining columns
print("Column 'days_before_booked' removed.")
print("Train Data Columns After Dropping:", train_data.columns)
print("Test Data Columns After Dropping:", test_data.columns)


Column 'days_before_booked' removed.
Train Data Columns After Dropping: Index(['trip_ID', 'visitor_nation', 'age_bracket', 'travelling_with',
       'key_activity', 'trip_purpose', 'first_time_visitor',
       'transport_package_international', 'source_of_info',
       'insurance_package', 'weather_at_arrival', 'tour_length', 'category',
       'booked_package', 'island_to_mainland_ratio', 'female_to_male_ratio',
       'days_before_booked_numeric', 'tour_length_category'],
      dtype='object')
Test Data Columns After Dropping: Index(['trip_ID', 'visitor_nation', 'age_bracket', 'travelling_with',
       'key_activity', 'trip_purpose', 'first_time_visitor',
       'transport_package_international', 'source_of_info',
       'insurance_package', 'weather_at_arrival', 'tour_length',
       'booked_package', 'island_to_mainland_ratio', 'female_to_male_ratio',
       'days_before_booked_numeric', 'tour_length_category'],
      dtype='object')


In [13]:
# Drop 'tour_length' column from train_data and test_data
train_data = train_data.drop(columns=['tour_length'], errors='ignore')
test_data = test_data.drop(columns=['tour_length'], errors='ignore')

# Verify the remaining columns
print("Column 'tour_length' removed.")
print("Train Data Columns After Dropping:", train_data.columns)
print("Test Data Columns After Dropping:", test_data.columns)
#Calculate the proportion of category 0, 1, and 2 for each visitor_nation



Column 'tour_length' removed.
Train Data Columns After Dropping: Index(['trip_ID', 'visitor_nation', 'age_bracket', 'travelling_with',
       'key_activity', 'trip_purpose', 'first_time_visitor',
       'transport_package_international', 'source_of_info',
       'insurance_package', 'weather_at_arrival', 'category', 'booked_package',
       'island_to_mainland_ratio', 'female_to_male_ratio',
       'days_before_booked_numeric', 'tour_length_category'],
      dtype='object')
Test Data Columns After Dropping: Index(['trip_ID', 'visitor_nation', 'age_bracket', 'travelling_with',
       'key_activity', 'trip_purpose', 'first_time_visitor',
       'transport_package_international', 'source_of_info',
       'insurance_package', 'weather_at_arrival', 'booked_package',
       'island_to_mainland_ratio', 'female_to_male_ratio',
       'days_before_booked_numeric', 'tour_length_category'],
      dtype='object')


In [14]:
# Calculate the proportion of category 0, 1, and 2 for each visitor_nation in train_data
category_proportion = train_data.groupby('visitor_nation')['category'].value_counts(normalize=True).unstack(fill_value=0)

# Function to assign spending category based on proportions
def spending_category(nation):
    # If the nation is not present in category_proportion, return 'unknown' or a default value
    if nation not in category_proportion.index:
        return 'unknown'

    # Otherwise, calculate the proportions
    high_spender_prop = category_proportion.loc[nation, 0]  # Proportion for high spender (category 0)
    moderate_spender_prop = category_proportion.loc[nation, 1]  # Proportion for moderate spender (category 1)
    low_spender_prop = category_proportion.loc[nation, 2]  # Proportion for low spender (category 2)

    # Determine the category with the highest proportion for the nationality
    if high_spender_prop > moderate_spender_prop and high_spender_prop > low_spender_prop:
        return 'high_spender'
    elif low_spender_prop > high_spender_prop and low_spender_prop > moderate_spender_prop:
        return 'low_spender'
    else:
        return 'moderate_spender'

# Apply the function to assign 'visitor_spending_category' to train_data
train_data['visitor_spending_category'] = train_data['visitor_nation'].map(spending_category)

# Similarly, apply the function to assign 'visitor_spending_category' to test_data
test_data['visitor_spending_category'] = test_data['visitor_nation'].map(spending_category)

# Check the new feature in both datasets
print("Train Data with New Feature:")
print(train_data[['visitor_nation', 'category', 'visitor_spending_category']].head())

print("Test Data with New Feature:")
print(test_data[['visitor_nation', 'visitor_spending_category']].head())



Train Data with New Feature:
  visitor_nation  category visitor_spending_category
3          ITALY       0.0              high_spender
4          ITALY       0.0              high_spender
6          INDIA       0.0          moderate_spender
7          KENYA       2.0          moderate_spender
8        AUSTRIA       0.0              high_spender
Test Data with New Feature:
  visitor_nation visitor_spending_category
0          CONGO          moderate_spender
1     SWIZERLAND              high_spender
2         MEXICO          moderate_spender
3          JAPAN          moderate_spender
4          SPAIN              high_spender


In [15]:
# Label encoding

# Concatenate train_data and test_data for consistent encoding
combined_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

# Identify categorical columns
categorical_cols = combined_data.select_dtypes(include=['object']).columns
categorical_cols = [col for col in categorical_cols if col != 'trip_ID']
# Initialize a dictionary to store mappings for each categorical column
label_mappings = {}

# Label encode each categorical column
for col in categorical_cols:
    # Get unique values and assign incremental labels
    unique_values = combined_data[col].dropna().unique()
    col_mapping = {val: idx + 1 for idx, val in enumerate(sorted(unique_values))}  # Map to 1, 2, 3...

    # Apply mapping to the column
    combined_data[col] = combined_data[col].map(col_mapping)

    # Store the mapping for later use
    label_mappings[col] = col_mapping

# Split back into train_data and test_data
train_data = combined_data.iloc[:len(train_data)].reset_index(drop=True)
test_data = combined_data.iloc[len(train_data):].reset_index(drop=True)

# Verify the results
print("Categorical columns label-encoded.")
print("Train Data Shape:", train_data.shape)
print("Test Data Shape:", test_data.shape)
print("Label Mappings:", label_mappings)

Categorical columns label-encoded.
Train Data Shape: (7250, 18)
Test Data Shape: (5852, 18)
Label Mappings: {'visitor_nation': {'AFGHANISTAN': 1, 'ALGERIA': 2, 'ANGOLA': 3, 'ARGENTINA': 4, 'ARMENIA': 5, 'AUSTRALIA': 6, 'AUSTRIA': 7, 'BAHRAIN': 8, 'BANGLADESH': 9, 'BARBADOS': 10, 'BELGIUM': 11, 'BERMUDA': 12, 'BOSNIA': 13, 'BOTSWANA': 14, 'BRAZIL': 15, 'BULGARIA': 16, 'BURGARIA': 17, 'BURUNDI': 18, 'CAMBODIA': 19, 'CAMEROON': 20, 'CANADA': 21, 'CAPE VERDE': 22, 'CHILE': 23, 'CHINA': 24, 'COLOMBIA': 25, 'COMORO': 26, 'CONGO': 27, 'COSTARICA': 28, 'CROATIA': 29, 'CYPRUS': 30, 'CZECH REPUBLIC': 31, 'DENMARK': 32, 'DJIBOUT': 33, 'DOMINICA': 34, 'DRC': 35, 'ECUADO': 36, 'EGYPT': 37, 'ERITREA': 38, 'ESTONIA': 39, 'ETHIOPIA': 40, 'FINLAND': 41, 'FRANCE': 42, 'GAMBIA': 43, 'GEORGIA': 44, 'GERMANY': 45, 'GHANA': 46, 'GREECE': 47, 'HUNGARY': 48, 'ICELAND': 49, 'INDIA': 50, 'INDONESIA': 51, 'IRAN': 52, 'IRELAND': 53, 'ISRAEL': 54, 'ITALY': 55, 'JAMAICA': 56, 'JAPAN': 57, 'JORDAN': 58, 'KENYA': 59,

In [16]:
# Split train_data into features and target
X = train_data.drop(columns=['category', 'trip_ID'])
y = train_data['category']

# Split into training and validation sets
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
# Ensure no missing values in X_train and X_val
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import SGD, Adam, Adadelta, RMSprop, Adagrad, Adamax
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_val)
log_reg_acc = accuracy_score(y_val, y_pred_log)
print("Logistic Regression Accuracy:", log_reg_acc)

# Support Vector Machine (SVM)
svm = SVC()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_val)
svm_acc = accuracy_score(y_val, y_pred_svm)
print("SVM Accuracy:", svm_acc)

# Neural Network Variations
def create_nn(input_dim, optimizer, dropout_rate=0.2, reg=0.01, hidden_layer_size=256):
    model = Sequential()
    model.add(Dense(hidden_layer_size, input_dim=input_dim, activation='relu', kernel_regularizer=l2(reg)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(hidden_layer_size // 2, activation='relu', kernel_regularizer=l2(reg)))
    model.add(Dropout(dropout_rate))
    model.add(Dense(3, activation='softmax'))  # Assuming 3 output classes
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# One-hot encode the labels for NN
y_train_nn = to_categorical(y_train)
y_val_nn = to_categorical(y_val)

# Optimizers to try
optimizers = [
        SGD(learning_rate=0.01, momentum=0.9),
    Adam(learning_rate=0.001),
    Adadelta(learning_rate=1.0),
    RMSprop(learning_rate=0.001),
    Adagrad(learning_rate=0.01),
    Adamax(learning_rate=0.002)
]

nn_accuracies = {}
input_dim = X_train.shape[1]

for optimizer in optimizers:
    model = create_nn(input_dim, optimizer, dropout_rate=0.3, reg=0.01, hidden_layer_size=256)
    model.fit(X_train, y_train_nn, epochs=50, batch_size=32, verbose=0, validation_data=(X_val, y_val_nn))
    loss, acc = model.evaluate(X_val, y_val_nn, verbose=0)
    nn_accuracies[str(optimizer)] = acc
    print(f"NN with {optimizer.__class__.__name__} Accuracy: {acc}")

# Select the best model
best_model = None
best_model_type = ""
if log_reg_acc >= svm_acc and log_reg_acc >= max(nn_accuracies.values()):
    best_model = log_reg
    best_model_type = "Logistic Regression"
elif svm_acc >= log_reg_acc and svm_acc >= max(nn_accuracies.values()):
    best_model = svm
    best_model_type = "SVM"
else:
    best_optimizer = max(nn_accuracies, key=nn_accuracies.get)
    best_model = create_nn(input_dim, eval(best_optimizer))
    best_model_type = f"Neural Network with {best_optimizer}"

print(f"Best Model: {best_model_type}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.716551724137931
SVM Accuracy: 0.6496551724137931


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


NN with SGD Accuracy: 0.6496551632881165


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


NN with Adam Accuracy: 0.6951724290847778


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


NN with Adadelta Accuracy: 0.6979310512542725


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


NN with RMSprop Accuracy: 0.7027586102485657


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


NN with Adagrad Accuracy: 0.6593103408813477


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


NN with Adamax Accuracy: 0.6937931180000305
Best Model: Logistic Regression


In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout
# from tensorflow.keras.regularizers import l2

# def create_nn(input_dim, dropout_rate=0.3, reg=0.01):
#     model = Sequential()
#     model.add(Dense(128, input_dim=input_dim, activation='relu', kernel_regularizer=l2(reg)))
#     model.add(Dropout(dropout_rate))
#     model.add(Dense(64, activation='relu', kernel_regularizer=l2(reg)))
#     model.add(Dropout(dropout_rate))
#     model.add(Dense(32, activation='relu', kernel_regularizer=l2(reg)))
#     model.add(Dropout(dropout_rate))
#     model.add(Dense(16, activation='relu', kernel_regularizer=l2(reg)))
#     model.add(Dropout(dropout_rate))
#     model.add(Dense(3, activation='softmax'))  # Assuming 3 output classes
#     return model


In [31]:

# Drop the 'trip_ID' column from X_test as it's not used for prediction
X_test = test_data.drop(columns=['trip_ID'])

# Ensure X_test has the same columns as the training data X_train
# Assuming 'X_train' is the training data used to fit the model, get its columns:
X_train_columns = X_train.columns

# Align X_test columns with X_train (adding missing columns with default values, like 0)
X_test = X_test.reindex(columns=X_train_columns, fill_value=0)

# Make predictions on the test data using the aligned features
test_data['category'] = best_model.predict(X_test)

# Create the submission dataframe with 'trip_ID' and 'category'
submission = test_data[['trip_ID', 'category']]

# Save the submission to a CSV file
submission.to_csv('submission.csv', index=False)

# Print success message
print("Submission file created: submission.csv")


Submission file created: submission.csv


In [32]:
from google.colab import files

# Download the file
files.download('submission.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Different Models tried :

In [21]:
# # Required Libraries
# import pandas as pd
# import numpy as np
# from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import f1_score
# from sklearn.neural_network import MLPClassifier
# from xgboost import XGBClassifier

# # Training data preparation (assuming X_train, X_val, y_train, y_val are pre-split)
# # Test data preparation (assuming test_df is preprocessed and available)

# # Define individual models
# rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
# xgb_model = XGBClassifier(random_state=42, n_estimators=100)
# lr_model = LogisticRegression(random_state=42, max_iter=200)
# gb_model = GradientBoostingClassifier(random_state=42, n_estimators=100)
# nn_model = MLPClassifier(random_state=42, max_iter=300)  # Neural Network

# # Ensemble Voting Classifier
# voting_clf = VotingClassifier(estimators=[
#     ('rf', rf_model),
#     ('xgb', xgb_model),
#     ('lr', lr_model),
#     ('gb', gb_model),
#     ('nn', nn_model)
# ], voting='soft')

# # Training the Voting Classifier
# voting_clf.fit(X_train, y_train)

# # Evaluate on validation data
# y_pred = voting_clf.predict(X_val)
# f1 = f1_score(y_val, y_pred, average='weighted')
# print(f"F1 Score on Validation Set: {f1}")



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


F1 Score on Validation Set: 0.7367426678294142


In [22]:

# # Drop the 'trip_ID' column from X_test as it's not used for prediction
# X_test = test_data.drop(columns=['trip_ID'])

# # Ensure X_test has the same columns as the training data X_train
# # Assuming 'X_train' is the training data used to fit the model, get its columns:
# X_train_columns = X_train.columns

# # Align X_test columns with X_train (adding missing columns with default values, like 0)
# X_test = X_test.reindex(columns=X_train_columns, fill_value=0)

# # Make predictions on the test data using the aligned features
# test_data['category'] = voting_clf.predict(X_test)

# # Create the submission dataframe with 'trip_ID' and 'category'
# submission = test_data[['trip_ID', 'category']]

# # Save the submission to a CSV file
# submission.to_csv('submission.csv', index=False)

# # Print success message
# print("Submission file created: submission.csv")

# from google.colab import files

# # Download the file
# files.download('submission.csv')


Submission file created: submission.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# from sklearn.neural_network import MLPClassifier  # For NN

# base_learners = [
#     ('rf', RandomForestClassifier(random_state=42, n_estimators=100)),
#     ('xgb', XGBClassifier(random_state=42, n_estimators=100)),
#     ('gb', GradientBoostingClassifier(random_state=42, n_estimators=100)),
#     ('ab', AdaBoostClassifier(algorithm='SAMME', random_state=42, n_estimators=100)),
#     ('lgbm', LGBMClassifier(random_state=42, n_estimators=100)),
#     ('lr', LogisticRegression(random_state=42, max_iter=1000)),  # Logistic Regression
#     ('nn', MLPClassifier(random_state=42, max_iter=300))         # Neural Network
# ]

# meta_model =  LogisticRegression(random_state=42)
# # meta_model =  MLPClassifier(random_state=42)
# # meta_model =  LGBMClassifier(random_state=42)
# # meta_model =  GradientBoostingClassifier(random_state=42)

# stacking_clf = StackingClassifier(estimators=base_learners, final_estimator=meta_model)


# stacking_clf.fit(X_train, y_train)
# y_pred = stacking_clf.predict(X_val)
# f1 = f1_score(y_val, y_pred, average='weighted')
# print(f"F1 Score on Validation Set: {f1}")

# test_predictions = stacking_clf.predict(X_test)
# submission_df = pd.DataFrame({
#     'trip_ID': test_df['trip_ID'],
#     'category': test_predictions
# })

# # Save to CSV
# submission_df.to_csv('submission_stacking_classifier_with_nn_lr.csv', index=False)

# print("Submission file 'submission_stacking_classifier_with_nn_lr.csv' created successfully.")

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score

# log_reg = LogisticRegression(max_iter=1000)
# log_reg.fit(X_train, y_train)
# y_pred_log = log_reg.predict(X_val)
# log_reg_acc = accuracy_score(y_val, y_pred_log)
# print("Logistic Regression Accuracy:", log_reg_acc)


In [None]:
# from sklearn.svm import SVC

# svm = SVC()
# svm.fit(X_train, y_train)
# y_pred_svm = svm.predict(X_val)
# svm_acc = accuracy_score(y_val, y_pred_svm)
# print("SVM Accuracy:", svm_acc)


In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout
# from tensorflow.keras.regularizers import l2

# def create_nn(input_dim, dropout_rate=0.3, reg=0.01):
#     model = Sequential()
#     model.add(Dense(128, input_dim=input_dim, activation='relu', kernel_regularizer=l2(reg)))
#     model.add(Dropout(dropout_rate))
#     model.add(Dense(64, activation='relu', kernel_regularizer=l2(reg)))
#     model.add(Dropout(dropout_rate))
#     model.add(Dense(32, activation='relu', kernel_regularizer=l2(reg)))
#     model.add(Dropout(dropout_rate))
#     model.add(Dense(16, activation='relu', kernel_regularizer=l2(reg)))
#     model.add(Dropout(dropout_rate))
#     model.add(Dense(3, activation='softmax'))
#     return model


In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.utils import to_categorical
# from sklearn.metrics import accuracy_score

# y_train_nn = to_categorical(y_train)
# y_val_nn = to_categorical(y_val)

# # Simple Neural Network without hidden layers
# model = Sequential()
# model.add(Dense(y_train_nn.shape[1], input_dim=X_train.shape[1], activation='softmax'))  # Output layer

# # Compiling the model with Adam optimizer
# adam_optimizer = Adam(learning_rate=0.001)  # You can adjust the learning rate here
# model.compile(optimizer=adam_optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# # Training the model
# model.fit(X_train, y_train_nn, epochs=50, batch_size=32, verbose=0)

# # Evaluate the model
# loss, acc = model.evaluate(X_val, y_val_nn, verbose=0)
# print("Neural Network with Adam Accuracy:", acc)


In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.optimizers import SGD
# from tensorflow.keras.utils import to_categorical
# from sklearn.metrics import accuracy_score

# y_train_nn = to_categorical(y_train)
# y_val_nn = to_categorical(y_val)

# # Simple Neural Network without hidden layers
# model = Sequential()
# model.add(Dense(y_train_nn.shape[1], input_dim=X_train.shape[1], activation='softmax'))  # Output layer

# # Compiling the model with SGD optimizer
# sgd_optimizer = SGD(learning_rate=0.01)  # You can adjust the learning rate here
# model.compile(optimizer=sgd_optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# # Train the model
# model.fit(X_train, y_train_nn, epochs=50, batch_size=32, verbose=0)

# # Evaluate the model
# loss, acc = model.evaluate(X_val, y_val_nn, verbose=0)
# print("Neural Network with SGD Accuracy:", acc)


In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.optimizers import Adadelta
# from tensorflow.keras.utils import to_categorical
# from sklearn.metrics import accuracy_score

# y_train_nn = to_categorical(y_train)
# y_val_nn = to_categorical(y_val)

# # Simple Neural Network without hidden layers
# model = Sequential()
# model.add(Dense(y_train_nn.shape[1], input_dim=X_train.shape[1], activation='softmax'))  # Output layer

# # Compiling the model with Adadelta optimizer
# adadelta_optimizer = Adadelta(learning_rate=1.0)  # You can adjust the learning rate here
# model.compile(optimizer=adadelta_optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# # Training the model
# model.fit(X_train, y_train_nn, epochs=50, batch_size=32, verbose=0)

# # Evaluate the model
# loss, acc = model.evaluate(X_val, y_val_nn, verbose=0)
# print("Neural Network with Adadelta Accuracy:", acc)


In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.optimizers import RMSprop
# from tensorflow.keras.utils import to_categorical
# from sklearn.metrics import accuracy_score

# y_train_nn = to_categorical(y_train)
# y_val_nn = to_categorical(y_val)

# # Simple Neural Network without hidden layers
# model = Sequential()
# model.add(Dense(y_train_nn.shape[1], input_dim=X_train.shape[1], activation='softmax'))  # Output layer

# # Compiling the model with RMSprop optimizer
# rmsprop_optimizer = RMSprop(learning_rate=0.001)  # You can adjust the learning rate here
# model.compile(optimizer=rmsprop_optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# # Training the model
# model.fit(X_train, y_train_nn, epochs=50, batch_size=32, verbose=0)

# # Evaluate the model
# loss, acc = model.evaluate(X_val, y_val_nn, verbose=0)
# print("Neural Network with RMSprop Accuracy:", acc)


In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.optimizers import Adamax
# from tensorflow.keras.utils import to_categorical
# from sklearn.metrics import accuracy_score

# y_train_nn = to_categorical(y_train)
# y_val_nn = to_categorical(y_val)

# # Simple Neural Network without hidden layers
# model = Sequential()
# model.add(Dense(y_train_nn.shape[1], input_dim=X_train.shape[1], activation='softmax'))  # Output layer

# # Compiling the model with Adamax optimizer
# adamax_optimizer = Adamax(learning_rate=0.002)  # You can adjust the learning rate here
# model.compile(optimizer=adamax_optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# # Training the model
# model.fit(X_train, y_train_nn, epochs=50, batch_size=32, verbose=0)

# # Evaluate the model
# loss, acc = model.evaluate(X_val, y_val_nn, verbose=0)
# print("Neural Network with Adamax Accuracy:", acc)


In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense
# from tensorflow.keras.optimizers import Adagrad
# from tensorflow.keras.utils import to_categorical
# from sklearn.metrics import accuracy_score

# y_train_nn = to_categorical(y_train)
# y_val_nn = to_categorical(y_val)

# # Simple Neural Network without hidden layers
# model = Sequential()
# model.add(Dense(y_train_nn.shape[1], input_dim=X_train.shape[1], activation='softmax'))  # Output layer

# # Compiling the model with Adagrad optimizer
# adagrad_optimizer = Adagrad(learning_rate=0.01)  # You can adjust the learning rate here
# model.compile(optimizer=adagrad_optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# # Training the model
# model.fit(X_train, y_train_nn, epochs=50, batch_size=32, verbose=0)

# # Evaluate the model
# loss, acc = model.evaluate(X_val, y_val_nn, verbose=0)
# print("Neural Network with AdaGrad Accuracy:", acc)


In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense, Dropout
# from tensorflow.keras.regularizers import l2

# def create_nn(input_dim, dropout_rate=0.3, reg=0.01):
#     model = Sequential()
#     model.add(Dense(128, input_dim=input_dim, activation='relu', kernel_regularizer=l2(reg)))
#     model.add(Dropout(dropout_rate))
#     model.add(Dense(64, activation='relu', kernel_regularizer=l2(reg)))
#     model.add(Dropout(dropout_rate))
#     model.add(Dense(32, activation='relu', kernel_regularizer=l2(reg)))
#     model.add(Dropout(dropout_rate))
#     model.add(Dense(16, activation='relu', kernel_regularizer=l2(reg)))
#     model.add(Dropout(dropout_rate))
#     model.add(Dense(3, activation='softmax'))  # Assuming 3 output classes
#     model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
#     return model
