In [None]:
!pip install kaggle  # Install Kaggle API
from google.colab import files

# Upload your Kaggle API key (kaggle.json)
files.upload()

# Move kaggle.json to the correct directory
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json  # Set permissions




Saving kaggle.json to kaggle.json


In [None]:
# Download the dataset
!kaggle competitions download -c cat-in-the-dat-ii

# Unzip the dataset
!unzip cat-in-the-dat-ii.zip


Downloading cat-in-the-dat-ii.zip to /content
 99% 41.0M/41.3M [00:01<00:00, 32.2MB/s]
100% 41.3M/41.3M [00:01<00:00, 25.2MB/s]
Archive:  cat-in-the-dat-ii.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [None]:
# Sample feature engineering steps
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import pandas as pd

# Read the training and test CSVs from the working directory, in that order
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))


In [None]:
# Count missing values per column *before* imputing. Kept in a variable because
# only the last expression of a cell is displayed.
missing_counts = train.isnull().sum()

# Forward-fill gaps with the previous row's value. `DataFrame.ffill()` replaces
# the deprecated `fillna(method='ffill')`; rebinding `train` avoids the
# in-place mutation.
# NOTE(review): forward fill cannot fill a gap in the very first row, so a
# column may still hold a leading NaN afterwards.
train = train.ffill()

# Show the pre-imputation missing counts as the cell output
missing_counts


  train.fillna(method='ffill', inplace=True)


In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# --- Nominal columns: integer label encoding --------------------------------
# One encoder per column is kept in `nominal_encoders` so the same mapping can
# be looked up later. Columns that are already numeric are skipped, which makes
# the cell safe to re-run.
nominal_cols = ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']
nominal_encoders = {}
for col in nominal_cols:
    if pd.api.types.is_numeric_dtype(train[col]):
        continue
    le = LabelEncoder()
    # Cast to str so a leftover NaN becomes the ordinary label 'nan'
    train[col] = le.fit_transform(train[col].astype(str))
    nominal_encoders[col] = le

# --- Ordinal columns: rank encoding -----------------------------------------
# The previous placeholder mapping ({'Low', 'Medium', 'High'}) was applied to
# every ordinal column; `Series.map` turns any value missing from the mapping
# into NaN, so a column whose labels differ from the placeholder was wiped out.
# NOTE(review): the level orders below are assumed from the competition's data
# description - confirm with `train[col].unique()`. If the observed labels are
# not covered by the assumed order, the column falls back to sorted order
# instead of silently producing NaN.
known_ordinal_levels = {
    'ord_1': ['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster'],
    'ord_2': ['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot'],
}
ordinal_cols = ['ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']

ordinal_mapping = {}  # column -> {label: rank} actually applied
for col in ordinal_cols:
    if pd.api.types.is_numeric_dtype(train[col]):
        continue  # already numeric (or already encoded by an earlier run)
    observed = train[col].dropna().unique()
    levels = known_ordinal_levels.get(col)
    if levels is None or not set(observed) <= set(levels):
        levels = sorted(observed)
    ordinal_mapping[col] = {level: rank for rank, level in enumerate(levels)}
    train[col] = train[col].map(ordinal_mapping[col])

# --- Date features (day_of_week and is_weekend) ------------------------------
# Build a calendar date from `day` and `month` with an assumed year of 2025.
# NOTE(review): this treats `day` as a day of the month. If `day` is already a
# weekday index, derive `day_of_week` from it directly instead - confirm.
calendar_date = pd.to_datetime(train[['day', 'month']].assign(year=2025))
train['day_of_week'] = calendar_date.dt.dayofweek
# Monday=0 ... Sunday=6, so 5 and 6 are the weekend; missing days compare False -> 0
train['is_weekend'] = (train['day_of_week'] >= 5).astype(int)



In [None]:
from sklearn.model_selection import train_test_split

# Target vector, and the feature matrix with the 'id' and 'target' columns removed
y = train['target']
X = train.drop(columns=['id', 'target'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
import pandas as pd
from scipy.stats import randint
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


def evaluate_search(search, X_eval, y_eval, accuracy_label):
    """Score a fitted search object on held-out data and print a report.

    Parameters
    ----------
    search : fitted GridSearchCV or RandomizedSearchCV
        Its `predict` method is used to produce the predictions.
    X_eval, y_eval : features and true labels of the evaluation set.
    accuracy_label : str
        Text printed in front of the accuracy value.

    Returns
    -------
    (predictions, accuracy) : the predicted labels and their accuracy score.
    """
    predictions = search.predict(X_eval)
    score = accuracy_score(y_eval, predictions)
    print(f"{accuracy_label}: {score}")
    print(classification_report(y_eval, predictions))
    return predictions, score


# Load data (fresh copy, independent of any earlier preprocessing cells)
train = pd.read_csv('train.csv')

# Identify categorical columns (those with non-numeric data)
non_numeric_cols = train.select_dtypes(include=['object']).columns
print(f"Non-numeric columns: {non_numeric_cols}")

# Encode binary flag columns stored as letters (e.g., 'T' -> 1, 'F' -> 0).
# Only string-typed binary columns are mapped: applying the letter map to a
# column that is already numeric would replace every value with NaN.
# NOTE(review): 'Y'/'N' are assumed as an alternative spelling of the flags -
# confirm with `train[col].value_counts()`. A column with any other labels is
# left for the label encoder below rather than being mapped to NaN.
binary_cols = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']
binary_flag_map = {'T': 1, 'F': 0, 'Y': 1, 'N': 0}
mapped_binary_cols = []
for col in binary_cols:
    if col not in non_numeric_cols:
        continue  # already numeric
    if set(train[col].dropna().unique()) <= set(binary_flag_map):
        train[col] = train[col].map(binary_flag_map)
        mapped_binary_cols.append(col)

# Encode all other categorical columns (nominal and ordinal)
for col in non_numeric_cols:
    if col not in mapped_binary_cols:  # Skip columns handled above
        le = LabelEncoder()
        train[col] = le.fit_transform(train[col].astype(str))  # Ensure conversion to string before encoding

# Feature engineering for date-related columns
# NOTE(review): `day` is treated as a day of the month in an assumed year
# (2025) - confirm this matches the meaning of the column.
calendar_date = pd.to_datetime(train[['day', 'month']].assign(year=2025))
train['day_of_week'] = calendar_date.dt.dayofweek
# Monday=0 ... Sunday=6, so values >= 5 are the weekend; missing days give 0
train['is_weekend'] = (train['day_of_week'] >= 5).astype(int)

# NOTE(review): no missing-value imputation happens in this cell, so numeric
# columns may still contain NaN when they reach the model - confirm the
# installed scikit-learn version accepts that, or impute first.

# Prepare features and target
X = train.drop(columns=['target', 'id'])  # Drop 'id' and 'target'
y = train['target']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize RandomForest model
rf = RandomForestClassifier(random_state=42)

# Define parameter grid for GridSearch
# 3 * 3 * 3 * 3 = 81 candidates, each fitted on 5 folds (405 fits in total)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters found by GridSearch
print(f"Best parameters: {grid_search.best_params_}")

# Evaluate on the validation set: accuracy and classification report
y_pred, accuracy = evaluate_search(grid_search, X_val, y_val, "Accuracy")

# Alternatively, you can use RandomizedSearchCV
# Define parameter distribution for RandomizedSearch
# (scipy's randint samples from [low, high): the upper bound is exclusive)
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(10, 50),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 4)
}

# Perform Randomized Search (100 sampled candidates on 5 folds = 500 fits).
# `random_state` fixes the sampled candidates so the search is reproducible.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Best parameters found by RandomizedSearch
print(f"RandomizedSearch best parameters: {random_search.best_params_}")

# Evaluate on the validation set: accuracy and classification report
y_pred_random, accuracy_random = evaluate_search(
    random_search, X_val, y_val, "RandomizedSearch Accuracy"
)


Non-numeric columns: Index(['bin_3', 'bin_4', 'nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4', 'nom_5',
       'nom_6', 'nom_7', 'nom_8', 'nom_9', 'ord_1', 'ord_2', 'ord_3', 'ord_4',
       'ord_5'],
      dtype='object')
Fitting 5 folds for each of 81 candidates, totalling 405 fits


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Fraction of validation labels that the predictions in `y_pred` match
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f"Accuracy: {accuracy}")

# Text report produced by classification_report for the same predictions
validation_report = classification_report(y_true=y_val, y_pred=y_pred)
print(validation_report)


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Define parameter distribution for RandomizedSearch
# (scipy's randint samples from [low, high): the upper bound is exclusive)
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(10, 50),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 4)
}

# Perform Randomized Search: 100 sampled candidates, each fitted on 5 folds.
# `random_state` fixes which candidates are sampled; without it every run of
# the cell explores a different set and the reported results are not repeatable.
# Relies on `rf`, `X_train` and `y_train` defined by an earlier cell.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Best parameters found by RandomizedSearch
print(random_search.best_params_)

# Evaluate on the validation set
y_pred_random = random_search.predict(X_val)

# Accuracy and classification report for RandomizedSearch
accuracy_random = accuracy_score(y_val, y_pred_random)
print(f"RandomizedSearch Accuracy: {accuracy_random}")
print(classification_report(y_val, y_pred_random))
