In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score

In [None]:
# Load the training data; the path is relative to the notebook's working directory
data_path = 'Log_Reg_train.csv'
df_train = pd.read_csv(data_path)

# Last expression of the cell, so the preview of the first rows is rendered as output
df_train.head()

Unnamed: 0,ID,totalSales,CompetitorPrice,Income,Advertising,Population,Price,Location,Age,Education,Urban,US
0,1,6.01,131,29,11,335,127,Bad,33,12,Yes,Yes
1,2,6.5,148,51,16,148,150,Medium,58,17,No,Yes
2,3,7.7,118,71,12,44,89,Medium,67,18,No,Yes
3,4,7.78,86,54,0,497,64,Bad,33,12,Yes,No
4,5,11.82,113,66,16,322,74,Good,76,15,Yes,Yes


In [3]:
def _sales_to_category(total_sales):
    """Bucket a totalSales value: 1 when <= 5, 2 when <= 10, otherwise 3."""
    if total_sales <= 5:
        return 1
    if total_sales <= 10:
        return 2
    return 3


# Derive the three-level classification target from the raw sales figure
df_train['Category'] = df_train['totalSales'].apply(_sales_to_category)

# One-hot encode the categorical predictors; drop_first removes one level per
# variable so the remaining indicator columns are not collinear.
# NOTE: this rebinds df_train, so re-running the cell without reloading the
# data fails because the original categorical columns are gone.
df_train = pd.get_dummies(
    df_train,
    columns=['Location', 'Urban', 'US'],
    drop_first=True,
)

In [4]:
# Columns that get standardized; the same list is reused for the test file later
numeric_cols = ['CompetitorPrice', 'Income', 'Advertising', 'Population', 'Price', 'Age', 'Education']

# Learn per-column mean/std, then overwrite the columns with standardized values.
# NOTE(review): the scaler is fitted on every row of df_train, i.e. before the
# train/test split performed later in the notebook, so its statistics also
# include the rows that end up in the held-out split.
scaler = StandardScaler()
scaler.fit(df_train[numeric_cols])
df_train[numeric_cols] = scaler.transform(df_train[numeric_cols])

In [5]:
# Target: the Category label derived from totalSales
y_target = df_train['Category']

# Predictors: every remaining column except the row identifier, the raw sales
# value the label was derived from, and the label itself
X_features = df_train.drop(columns=['ID', 'totalSales', 'Category'])

In [6]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# target so each part keeps the class proportions, and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y_target,
    test_size=0.2,
    random_state=43,
    stratify=y_target,
)

In [7]:
# Baseline one-vs-rest logistic regression using the liblinear solver.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument — confirm against the installed version before upgrading.
logistic_model = LogisticRegression(
    solver='liblinear',
    multi_class='ovr',
)

# Fit on the training split; as the last expression, the fitted estimator is displayed
logistic_model.fit(X_train, y_train)

In [8]:
# Search space for the grid search: regularization type and inverse
# regularization strength C, with the solver pinned to liblinear
# (2 penalties x 5 values of C = 10 candidates).
hyperparameter_grid = dict(
    penalty=['l1', 'l2'],
    C=[0.001, 0.1, 1, 10, 100],
    solver=['liblinear'],
)

In [9]:
# Exhaustively evaluate every combination in hyperparameter_grid with 3-fold
# cross-validation on the training split, ranking candidates by accuracy
grid_search_cv = GridSearchCV(
    estimator=logistic_model,
    param_grid=hyperparameter_grid,
    scoring='accuracy',
    cv=3,
)
grid_search_cv.fit(X_train, y_train)

In [10]:
# Estimator chosen by the grid search (the best-scoring parameter combination)
best_logistic_model = grid_search_cv.best_estimator_

# Predict class labels for the held-out test split
y_predicted = best_logistic_model.predict(X_test)

In [11]:
# Overall accuracy on the held-out split
test_accuracy = accuracy_score(y_test, y_predicted)
print(f'Accuracy: {test_accuracy}')

# Per-class precision / recall / F1 and support
print(classification_report(y_test, y_predicted))

Accuracy: 0.8285714285714286
              precision    recall  f1-score   support

           1       0.70      0.54      0.61        13
           2       0.83      0.91      0.87        43
           3       0.92      0.86      0.89        14

    accuracy                           0.83        70
   macro avg       0.82      0.77      0.79        70
weighted avg       0.82      0.83      0.82        70



In [12]:
# 3-fold cross-validated accuracy of the tuned model.
# NOTE(review): this scores on the full X_features / y_target, which include
# the rows of the held-out split, so it is not independent of the test set.
cv_scores = cross_val_score(
    estimator=best_logistic_model,
    X=X_features,
    y=y_target,
    scoring='accuracy',
    cv=3,
)
mean_cv_accuracy = cv_scores.mean()
print(f'Cross-Validation Accuracy: {mean_cv_accuracy}')

Cross-Validation Accuracy: 0.8050152274290205


In [None]:
# Load the test file to be scored; the path is relative to the working directory
test_data_path = 'Log_Reg_test.csv'
df_test = pd.read_csv(test_data_path)

In [14]:
# Keep the IDs aside for the submission file; df_test itself is reduced to the
# model's feature columns further down.
test_ID = df_test['ID']

# One-hot encode the categorical predictors, keeping EVERY level.
# drop_first is deliberately not used here: it drops the first level *present
# in this frame*, so if the test file lacks a level that the training data had
# (e.g. no 'Bad' location, or only 'Yes' for Urban) a different level would be
# dropped than during training, and the zero-filling of missing columns would
# then silently encode those rows as the training baseline. With all levels
# encoded, the later selection of X_train.columns keeps exactly the indicator
# columns the model was trained on and discards the baseline ones.
df_test = pd.get_dummies(df_test, columns=['Location', 'Urban', 'US'])

# Apply the scaler that was already fitted on the training data (transform
# only, never refit on the test file).
df_test[numeric_cols] = scaler.transform(df_test[numeric_cols])

In [15]:
# Feature columns the model was trained on that the encoded test frame lacks
# (e.g. an indicator for a category level that never occurs in the test file)
missing_columns = set(X_train.columns).difference(df_test.columns)

# Add each absent column filled with zeros so the frame can be aligned to the
# training layout afterwards
for col in missing_columns:
    df_test[col] = 0

In [16]:
# Align the test frame to the training layout: same columns, same order.
# Columns the model never saw (including ID) are discarded here.
df_test = df_test.loc[:, X_train.columns]

# Score the test rows with the tuned model
test_predictions = best_logistic_model.predict(df_test)

# Pair each preserved ID with its predicted category
submission_df = pd.DataFrame({'ID': test_ID, 'Category': test_predictions})

In [17]:
# Write the predictions to 'submission.csv' in the working directory;
# index=False keeps the DataFrame's row index out of the file
submission_df.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' created successfully.")

Submission file 'submission.csv' created successfully.
