In [2]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [4]:
# Load the training and test tables from CSV files in the working directory.
bd_train = pd.read_csv('counterfeit_train.csv')
bd_test = pd.read_csv('counterfeit_test.csv')

In [6]:
# Preview the first ten training rows.
bd_train.head(10)

Unnamed: 0,Medicine_ID,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level,Counterfeit_Sales
0,RRA15,13.1,Area046,1995,160.2366,Antimalarial,critical,0.070422,DownTown,Tier 1,Small,1775.5026
1,YVV26,,Area027,1983,110.4384,Mstablizers,mild,0.013,CityLimits,Tier 3,Medium,3069.152
2,LJC15,9.025,Area046,1995,259.4092,Cardiac,mild,0.060783,DownTown,Tier 1,Small,2603.092
3,GWC40,11.8,Area046,1995,99.983,OralContraceptives,mild,0.065555,DownTown,Tier 1,Small,1101.713
4,QMN13,,Area019,1983,56.4402,Hreplacements,critical,0.248859,MidTownResidential,Tier 1,Small,158.9402
5,JDG81,8.775,Area045,2000,165.5656,Antiseptics,mild,0.088881,DownTown,Tier 2,Unknown,3047.8464
6,KPX48,18.0,Area018,2007,156.9102,Hreplacements,critical,0.051685,Industrial,Tier 3,Medium,2883.3938
7,CYW14,7.68,Area013,1985,154.347,Antiseptics,mild,0.02815,DownTown,Tier 3,High,3262.234
8,TVF57,18.55,Area046,1995,107.3068,Hreplacements,critical,0.060266,DownTown,Tier 1,Small,1959.9292
9,MGU75,9.51,Area013,1985,161.1392,Statins,mild,0.023755,DownTown,Tier 3,High,3095.784


In [8]:
# Column dtypes and non-null counts for the training frame.
bd_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6818 entries, 0 to 6817
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Medicine_ID          6818 non-null   object 
 1   Counterfeit_Weight   5652 non-null   float64
 2   DistArea_ID          6818 non-null   object 
 3   Active_Since         6818 non-null   int64  
 4   Medicine_MRP         6818 non-null   float64
 5   Medicine_Type        6818 non-null   object 
 6   SidEffect_Level      6818 non-null   object 
 7   Availability_rating  6818 non-null   float64
 8   Area_Type            6818 non-null   object 
 9   Area_City_Type       6818 non-null   object 
 10  Area_dist_level      6818 non-null   object 
 11  Counterfeit_Sales    6818 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 639.3+ KB


In [10]:
# Column dtypes and non-null counts for the test frame.
bd_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1705 entries, 0 to 1704
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Medicine_ID          1705 non-null   object 
 1   Counterfeit_Weight   1408 non-null   float64
 2   DistArea_ID          1705 non-null   object 
 3   Active_Since         1705 non-null   int64  
 4   Medicine_MRP         1705 non-null   float64
 5   Medicine_Type        1705 non-null   object 
 6   SidEffect_Level      1705 non-null   object 
 7   Availability_rating  1705 non-null   float64
 8   Area_Type            1705 non-null   object 
 9   Area_City_Type       1705 non-null   object 
 10  Area_dist_level      1705 non-null   object 
dtypes: float64(3), int64(1), object(7)
memory usage: 146.7+ KB


In [12]:
# Target to predict.
y = bd_train['Counterfeit_Sales']
# Feature matrix: every remaining column except Medicine_ID, which is left out of the model inputs.
X = bd_train.drop(columns=['Medicine_ID', 'Counterfeit_Sales'])

In [14]:
# Fill missing numeric values with each column's median.
# The result is assigned back to X[col] rather than calling
# fillna(inplace=True) on the selected column: the in-place form operates on
# an intermediate object, which pandas warns about and which leaves X
# unchanged when Copy-on-Write is active.
for col in X.select_dtypes(include=['float64', 'int64']).columns:
    X[col] = X[col].fillna(X[col].median())

In [16]:
# Integer-encode every object-typed feature column and remember the fitted
# encoder per column so the same mapping can be applied to the test frame.
label_encoders = {}
categorical_columns = X.select_dtypes(include=['object']).columns
for column_name in categorical_columns:
    encoder = LabelEncoder()
    X[column_name] = encoder.fit_transform(X[column_name])
    label_encoders[column_name] = encoder

In [18]:
# Fill missing numeric test values.
# Use the median learned from the training features (X) so the test frame is
# preprocessed with training statistics only, matching how the scaler is fit
# on training data and merely applied to the test data. Fall back to the test
# median only for a numeric column that X does not contain.
# The result is assigned back instead of using fillna(inplace=True) on the
# selected column, which acts on an intermediate object and is not guaranteed
# to modify bd_test.
for col in bd_test.select_dtypes(include=['float64', 'int64']).columns:
    fill_value = X[col].median() if col in X.columns else bd_test[col].median()
    bd_test[col] = bd_test[col].fillna(fill_value)

In [20]:
# Apply the encoders fitted on the training features to the matching
# object-typed test columns; columns without a stored encoder are left as-is.
encodable_columns = [
    name
    for name in bd_test.select_dtypes(include=['object']).columns
    if name in label_encoders
]
for name in encodable_columns:
    bd_test[name] = label_encoders[name].transform(bd_test[name])

In [22]:
# Hold out 20% of the rows for validation; random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
def train_and_evaluate_model(model, model_name):
    """
    Fit an estimator, print its validation RMSE, and return test predictions.

    Reads the notebook-level X_train / y_train for fitting, X_val / y_val for
    scoring, and bd_test (without its Medicine_ID column) for prediction.

    Parameters:
    model: A scikit-learn estimator providing fit() and predict().
    model_name: Label used in the printed progress and score messages.

    Returns:
    test_predictions: Predicted values for the test set.
    """
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)

    # Score on the held-out split: RMSE is the square root of the MSE.
    validation_mse = mean_squared_error(y_val, model.predict(X_val))
    rmse = np.sqrt(validation_mse)
    print(f"{model_name} Validation RMSE: {rmse}")

    # The identifier column is not a model input, so drop it before predicting.
    return model.predict(bd_test.drop(columns=['Medicine_ID']))

In [26]:
# Standardizer used to produce the scaled inputs for linear regression.
scaler = StandardScaler()

In [28]:
# Learn the scaling parameters from the training split only, then apply that
# same fitted transform to the validation split and the test features.
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled, bd_test_scaled = (
    scaler.transform(frame)
    for frame in (X_val, bd_test.drop(columns=['Medicine_ID']))
)

In [30]:
# Ordinary least-squares baseline with default settings.
lr_model = LinearRegression()

In [32]:
# Fit on the standardized training features.
lr_model.fit(X_train_scaled, y_train)

In [34]:
# Predict the held-out validation rows (scaled with the training-fit scaler).
lr_val_pred = lr_model.predict(X_val_scaled)

In [36]:
# Validation RMSE: square root of the mean squared error.
lr_rmse = np.sqrt(mean_squared_error(y_val, lr_val_pred))

In [38]:
# Report the linear-regression validation score.
print(f"Linear Regression Validation RMSE: {lr_rmse}")

Linear Regression Validation RMSE: 1134.9547614718535


In [40]:
# Predict the test set from its scaled features.
lr_test_predictions = lr_model.predict(bd_test_scaled)

In [42]:
# Decision-tree search space: 3 x 3 x 3 = 27 parameter combinations.
dt_param_grid = {
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [44]:
# Seeded tree so repeated runs give the same result.
dt_model = DecisionTreeRegressor(random_state=42)
# 3-fold cross-validated search over dt_param_grid, scored by negative MSE
# (GridSearchCV maximizes the score, so a lower MSE ranks higher).
dt_grid_search = GridSearchCV(dt_model, dt_param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1)

In [46]:
# NOTE(review): no cell visible in this notebook references `param_grid` — the
# decision-tree search is built from dt_param_grid and the random-forest search
# from rf_param_grid. This looks like a leftover; confirm and remove if unused.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10]
}

In [48]:
# Run the decision-tree search on the unscaled training split.
dt_grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [50]:
# Keep the estimator built from the best-scoring parameter combination.
best_dt_model = dt_grid_search.best_estimator_

In [52]:
# Predict the held-out validation rows with the selected tree.
dt_val_pred = best_dt_model.predict(X_val)

In [54]:
# Validation RMSE for the selected decision tree, then report it.
dt_rmse = np.sqrt(mean_squared_error(y_val, dt_val_pred))
print(f"Decision Tree Validation RMSE: {dt_rmse}")

Decision Tree Validation RMSE: 1077.6061127674707


In [56]:
# Predict the test set; Medicine_ID is dropped because it is not a model input.
dt_test_predictions = best_dt_model.predict(bd_test.drop(columns=['Medicine_ID']))

In [58]:
# Random-forest search space: 3 x 3 x 3 x 3 x 2 = 162 parameter combinations.
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

In [60]:
# Seeded forest so repeated runs give the same result.
rf_model = RandomForestRegressor(random_state=42)

In [62]:
# 3-fold cross-validated search over rf_param_grid, scored by negative MSE.
rf_grid_search = GridSearchCV(rf_model, rf_param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1)

In [66]:
# Run the random-forest search on the unscaled training split (162 x 3 = 486 fits).
rf_grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 162 candidates, totalling 486 fits


In [68]:
# Intentionally no second fit here: rf_grid_search was already fitted in the
# previous cell. Repeating the identical search would re-run all 486 fits and,
# with the seeded forest and the unshuffled 3-fold split, select the same
# best_estimator_.


In [70]:
# Keep the estimator built from the best-scoring parameter combination.
best_rf_model = rf_grid_search.best_estimator_

In [72]:
# Predict the held-out validation rows with the selected forest.
rf_val_pred = best_rf_model.predict(X_val)

In [74]:
# Validation RMSE: square root of the mean squared error.
rf_rmse = np.sqrt(mean_squared_error(y_val, rf_val_pred))

In [76]:
# Report the random-forest validation score.
print(f"Random Forest Validation RMSE: {rf_rmse}")
# Predict the test set; Medicine_ID is dropped because it is not a model input.
rf_test_predictions = best_rf_model.predict(bd_test.drop(columns=['Medicine_ID']))

Random Forest Validation RMSE: 1077.7975882980074


In [78]:
# Linear Regression Submission
# Writes a single-column CSV of predictions in test-row order, without the index.
# NOTE(review): the file carries no Medicine_ID, so rows are identifiable only
# by position — confirm this matches the expected submission format.
lr_submission = pd.DataFrame({'Counterfeit_Sales': lr_test_predictions})
lr_submission.to_csv('submission_linear_regression.csv', index=False)

In [None]:
# Decision Tree Regressor Submission
# Writes a single-column CSV of predictions in test-row order, without the index.
# NOTE(review): the file carries no Medicine_ID, so rows are identifiable only
# by position — confirm this matches the expected submission format.
dt_submission = pd.DataFrame({'Counterfeit_Sales': dt_test_predictions})
dt_submission.to_csv('submission_decision_tree.csv', index=False)

In [None]:
# Random Forest Regressor Submission
# Writes a single-column CSV of predictions in test-row order, without the index.
# NOTE(review): the file carries no Medicine_ID, so rows are identifiable only
# by position — confirm this matches the expected submission format.
rf_submission = pd.DataFrame({'Counterfeit_Sales': rf_test_predictions})
rf_submission.to_csv('submission_random_forest.csv', index=False)