In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [2]:
# Load the datasets with 'latin1' encoding
train_df = pd.read_csv('store_train.csv', encoding='latin1')
test_df = pd.read_csv('store_test.csv', encoding='latin1')

# Display the first few rows of the training data
print(train_df.head())
print(train_df.info())
print(train_df.describe())

           Id  sales0  sales1  sales2  sales3  sales4  country  State  CouSub  \
0  2300919770     848     588     666    1116    1133        9     23   19770   
1  5000129575     925     717     780    1283    1550        1     50   29575   
2  2501308470     924     616     739    1154    1314       13     25    8470   
3   603599999     924     646     683    1292    1297       35      6   99999   
4  5002760100    1017     730     735    1208    1326       27     50   60100   

       countyname         storecode                            Areaname  \
0  Hancock County  NCNTY23009N23009                  Hancock County, ME   
1  Addison County  NCNTY50001N50001                  Addison County, VT   
2  Hampden County  METRO44140M44140  Springfield, MA HUD Metro FMR Area   
3   Lassen County  NCNTY06035N06035                   Lassen County, CA   
4  Windsor County  NCNTY50027N50027                  Windsor County, VT   

   countytownname  population state_alpha         store_Type  

In [3]:
# Display the first few rows of the training data
print(train_df.head())
print(train_df.info())
print(train_df.describe())

# Separate features and target variable from training data
X = train_df.drop(columns=['store'])
y = train_df['store']

           Id  sales0  sales1  sales2  sales3  sales4  country  State  CouSub  \
0  2300919770     848     588     666    1116    1133        9     23   19770   
1  5000129575     925     717     780    1283    1550        1     50   29575   
2  2501308470     924     616     739    1154    1314       13     25    8470   
3   603599999     924     646     683    1292    1297       35      6   99999   
4  5002760100    1017     730     735    1208    1326       27     50   60100   

       countyname         storecode                            Areaname  \
0  Hancock County  NCNTY23009N23009                  Hancock County, ME   
1  Addison County  NCNTY50001N50001                  Addison County, VT   
2  Hampden County  METRO44140M44140  Springfield, MA HUD Metro FMR Area   
3   Lassen County  NCNTY06035N06035                   Lassen County, CA   
4  Windsor County  NCNTY50027N50027                  Windsor County, VT   

   countytownname  population state_alpha         store_Type  

In [4]:
# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing for numerical data: impute missing values and scale
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data: impute missing values and one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define the model
model = RandomForestClassifier(random_state=42)

# Create a pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'model__n_estimators': [100, 200],
    'model__max_depth': [None, 10, 20],
    'model__min_samples_split': [2, 5],
    'model__min_samples_leaf': [1, 2]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X, y)

# Print the best parameters and best AUC score
print(f'Best parameters: {grid_search.best_params_}')
print(f'Best AUC score: {grid_search.best_score_}')

# Train the best model on the entire training data
best_model = grid_search.best_estimator_

# Predict probabilities on the test data
test_probs = best_model.predict_proba(test_df)[:, 1]

# Create a DataFrame with the results
submission_df = pd.DataFrame({'Probability': test_probs})

# Save the results to a CSV file
submission_df.to_csv('store_predictions2.csv', index=False)

Best parameters: {'model__max_depth': None, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 200}
Best AUC score: 0.8092080602178691


