In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
# NOTE(review): presumably asks for Kaggle credentials so private sources can be fetched —
# confirm against the kagglehub documentation.
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the 'System-Threat-Forecaster' competition data through kagglehub and keep the returned value.
# NOTE(review): the loading cells further down read from the hard-coded
# '/kaggle/input/System-Threat-Forecaster/' directory and never use this variable —
# confirm both locations agree when running outside Kaggle.
system_threat_forecaster_path = kagglehub.competition_download('System-Threat-Forecaster')

print('Data source import complete.')


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier

Loading train data

In [None]:
# Read the training split from the Kaggle input directory and display its shape.
train_csv_path = '/kaggle/input/System-Threat-Forecaster/train.csv'
train_data = pd.read_csv(train_csv_path)
train_data.shape

Loading Test data

In [None]:
# Read the test split from the Kaggle input directory and display its shape.
test_csv_path = '/kaggle/input/System-Threat-Forecaster/test.csv'
test_data = pd.read_csv(test_csv_path)
test_data.shape

# EDA

In [None]:
# Print the DataFrame.info() summary of the training frame.
train_data.info()

In [None]:
# Share of missing entries per column of the training data, expressed in percent.
n_train_rows = len(train_data)
train_null_percent = train_data.isnull().sum().div(n_train_rows).mul(100)
train_null_percent

In [None]:
# Share of missing entries per column of the test data, expressed in percent.
n_test_rows = len(test_data)
test_null_percent = test_data.isnull().sum().div(n_test_rows).mul(100)
test_null_percent

From the two cells above we can infer that the percentage of missing values in each feature is negligible (below 1%).
**Now let us look at each feature separately.**

In [None]:
# Number of distinct MachineID values in the training data.
train_data['MachineID'].nunique()

The MachineID feature contains 99835 distinct values, so it is not helpful.

In [None]:
# Frequency of each ProductName value in the training data.
train_data['ProductName'].value_counts()

In [None]:
# Horizontal bar chart of how often each ProductName value occurs.
# With kind='barh' the categories are drawn along the y-axis and the counts along
# the x-axis, so the axis labels are assigned accordingly (they were swapped before).
ax = train_data['ProductName'].value_counts().plot(kind='barh', color='skyblue')

# Display the plot
ax.set_title('Distribution of ProductName feature')
ax.set_xlabel('Count')
ax.set_ylabel('ProductName')
plt.show()

The ProductName feature takes highly imbalanced binary values, with win8defender accounting for most of the data.

In [None]:
# Number of distinct EngineVersion values in the training data.
train_data['EngineVersion'].nunique()

In [None]:
# Bar chart of the four most frequent EngineVersion values plus one aggregated 'Other' bar.
engine_version_counts = train_data['EngineVersion'].value_counts()
top_4 = engine_version_counts.iloc[:4]
other_count = engine_version_counts.iloc[4:].sum()
top_5 = pd.concat([top_4, pd.Series({'Other': other_count})])

ax = top_5.plot.barh(color='skyblue')
ax.set(title='Distribution of EngineVersion', xlabel='Count', ylabel='EngineVersion')
plt.show()


The **EngineVersion** feature looks good, with 37 distinct values that are well distributed.

In [None]:
# Frequency of each AppVersion value in the training data.
train_data['AppVersion'].value_counts()

In [None]:
# Bar chart of the four most frequent AppVersion values plus one aggregated 'Other' bar.
app_version_counts = train_data['AppVersion'].value_counts()
top_4 = app_version_counts.iloc[:4]
other_count = app_version_counts.iloc[4:].sum()
top_5 = pd.concat([top_4, pd.Series({'Other': other_count})])

ax = top_5.plot.barh(color='skyblue')
ax.set(title='Distribution of AppVersion', xlabel='Count', ylabel='AppVersion')
plt.show()

The **AppVersion** feature also looks good, with 69 distinct values, and can be useful.

In [None]:
# Frequency of each SignatureVersion value in the training data.
train_data['SignatureVersion'].value_counts()

In [None]:
# Bar chart of the four most frequent SignatureVersion values plus one aggregated 'Other' bar.
signature_version_counts = train_data['SignatureVersion'].value_counts()
top_4 = signature_version_counts.iloc[:4]
other_count = signature_version_counts.iloc[4:].sum()
top_5 = pd.concat([top_4, pd.Series({'Other': other_count})])

ax = top_5.plot.barh(color='skyblue', edgecolor='black')
ax.set(title='Distribution of SignatureVersion', ylabel='SignatureVersion', xlabel='Count')
plt.xticks(rotation=45)
plt.show()

The SignatureVersion feature has 2735 distinct values. As a categorical feature with high cardinality it will be difficult to deal with, so we may ignore it.

In [None]:
# Frequency of each IsBetaUser value in the training data.
train_data['IsBetaUser'].value_counts()

The IsBetaUser feature has only one value throughout the dataset, so it is a useless feature.

In [None]:
# Frequency of each RealTimeProtectionState value in the training data.
train_data['RealTimeProtectionState'].value_counts()

In [None]:
# Bar chart of the four most frequent RealTimeProtectionState values plus one aggregated 'Other' bar.
rtp_state_counts = train_data['RealTimeProtectionState'].value_counts()
top_4 = rtp_state_counts.iloc[:4]
other_count = rtp_state_counts.iloc[4:].sum()
top_5 = pd.concat([top_4, pd.Series({'Other': other_count})])

ax = top_5.plot.barh(color='skyblue', edgecolor='black')
ax.set(title='Distribution of RealTimeProtectionState', ylabel='RealTimeProtectionState', xlabel='Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Frequency of each IsPassiveModeEnabled value in the training data.
train_data['IsPassiveModeEnabled'].value_counts()

In [None]:
# Horizontal bar chart of the IsPassiveModeEnabled value frequencies.
passive_mode_counts = train_data['IsPassiveModeEnabled'].value_counts()
ax = passive_mode_counts.plot.barh(color='skyblue', edgecolor='black')

ax.set(title='Distribution of IsPassiveModeEnabled', ylabel='IsPassiveModeEnabled', xlabel='Count')
plt.xticks(rotation=45)
plt.show()

IsPassiveModeEnabled is a binary feature which can be useful.

In [None]:
# Frequency of each AntivirusConfigID value in the training data.
train_data['AntivirusConfigID'].value_counts()

In [None]:
# Bar chart of the four most frequent AntivirusConfigID values plus one aggregated 'Other' bar.
antivirus_config_counts = train_data['AntivirusConfigID'].value_counts()
top_4 = antivirus_config_counts.iloc[:4]
other_count = antivirus_config_counts.iloc[4:].sum()
top_5 = pd.concat([top_4, pd.Series({'Other': other_count})])

ax = top_5.plot.barh(color='skyblue', edgecolor='black')
ax.set(title='Distribution of AntivirusConfigID', ylabel='AntivirusConfigID', xlabel='Count')
plt.xticks(rotation=45)
plt.show()

With the most frequent value contributing more than 50% of the dataset, this feature (AntivirusConfigID) may not be useful.

In [None]:
# Frequency of each NumAntivirusProductsInstalled value in the training data.
train_data['NumAntivirusProductsInstalled'].value_counts()

In [None]:
# Horizontal bar chart of the NumAntivirusProductsInstalled value frequencies.
av_installed_counts = train_data['NumAntivirusProductsInstalled'].value_counts()
ax = av_installed_counts.plot.barh(color='skyblue', edgecolor='black')

ax.set(title='Distribution of NumAntivirusProductsInstalled', ylabel='NumAntivirusProductsInstalled', xlabel='Count')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Frequency of each NumAntivirusProductsEnabled value in the training data.
train_data['NumAntivirusProductsEnabled'].value_counts()

In [None]:
# Horizontal bar chart of the NumAntivirusProductsEnabled value frequencies.
av_enabled_counts = train_data['NumAntivirusProductsEnabled'].value_counts()
ax = av_enabled_counts.plot.barh(color='skyblue', edgecolor='black')

ax.set(title='Distribution of NumAntivirusProductsEnabled', ylabel='NumAntivirusProductsEnabled', xlabel='Count')
plt.xticks(rotation=45)
plt.show()

With nearly 70% of the values of NumAntivirusProductsInstalled and 98% of the values of NumAntivirusProductsEnabled concentrated on a single value, these features are extremely biased, so it is better not to consider them.

In [None]:
# Frequency of each HasTpm value in the training data.
train_data['HasTpm'].value_counts()

In [None]:
# Horizontal bar chart of the HasTpm value frequencies.
has_tpm_counts = train_data['HasTpm'].value_counts()
ax = has_tpm_counts.plot.barh(color='skyblue', edgecolor='black')

ax.set(title='Distribution of HasTpm', ylabel='HasTpm', xlabel='Count')
plt.xticks(rotation=45)
plt.show()

The binary feature HasTpm is extremely dominated by the value 1. This suggests the feature is almost constant, with only a small number of instances having the value 0. So we will not consider this feature.

In [None]:
# Frequency of each CountryID value in the training data.
train_data['CountryID'].value_counts()

In [None]:
# Bar chart of the nine most frequent CountryID values plus one aggregated 'Other' bar.
country_counts = train_data['CountryID'].value_counts()
top_9 = country_counts.iloc[:9]
other_count = country_counts.iloc[9:].sum()
top_10 = pd.concat([top_9, pd.Series({'Other': other_count})])

ax = top_10.plot.barh(color='skyblue', edgecolor='black')
ax.set(title='Distribution of CountryID', ylabel='CountryID', xlabel='Count')
plt.xticks(rotation=45)
plt.show()

This feature (CountryID) seems to be well distributed and can be useful.

In [None]:
# Frequency of each CityID value in the training data.
train_data['CityID'].value_counts()

In [None]:
# Frequency of each GeoRegionID value in the training data.
train_data['GeoRegionID'].value_counts()

In [None]:
# Bar chart of the nine most frequent GeoRegionID values plus one aggregated 'Other' bar.
geo_region_counts = train_data['GeoRegionID'].value_counts()
top_9 = geo_region_counts.iloc[:9]
other_count = geo_region_counts.iloc[9:].sum()
top_10 = pd.concat([top_9, pd.Series({'Other': other_count})])

ax = top_10.plot.barh(color='skyblue', edgecolor='black')
ax.set(title='Distribution of GeoRegionID', ylabel='GeoRegionID', xlabel='Count')
plt.xticks(rotation=45)
plt.show()

With 16047 unique values, the CityID feature is likely to be useless. Instead, we can consider the GeoRegionID feature, which will be more useful.

In [None]:
# Frequency of each LocaleEnglishNameID value in the training data.
train_data['LocaleEnglishNameID'].value_counts()

In [None]:
# Bar chart of the nine most frequent LocaleEnglishNameID values plus one aggregated 'Other' bar.
locale_counts = train_data['LocaleEnglishNameID'].value_counts()
top_9 = locale_counts.iloc[:9]
other_count = locale_counts.iloc[9:].sum()
top_10 = pd.concat([top_9, pd.Series({'Other': other_count})])

ax = top_10.plot.barh(color='skyblue', edgecolor='black')
ax.set(title='Distribution of LocaleEnglishNameID', ylabel='LocaleEnglishNameID', xlabel='Count')
plt.xticks(rotation=45)
plt.show()

The LocaleEnglishNameID feature is well distributed and can be considered.

In [None]:
# Each PlatformType value's count as a percentage of all training rows.
train_data['PlatformType'].value_counts().div(len(train_data)).mul(100)

In [None]:
# Each Processor value's count as a percentage of all training rows.
train_data['Processor'].value_counts().div(len(train_data)).mul(100)

With almost 98% of the values in the PlatformType feature being the same, this feature can be useless.

In [None]:
# Frequency of each OSVersion value in the training data.
train_data['OSVersion'].value_counts()

In [None]:
# Frequency of each OSBuildNumber value in the training data.
train_data['OSBuildNumber'].value_counts()

In [None]:
# Bar chart of the five most frequent OSBuildNumber values plus one aggregated 'Other' bar.
os_build_counts = train_data['OSBuildNumber'].value_counts()
top_5 = os_build_counts.iloc[:5]
other_count = os_build_counts.iloc[5:].sum()
top_6 = pd.concat([top_5, pd.Series({'Other': other_count})])

ax = top_6.plot.barh(color='skyblue', edgecolor='black')
ax.set(title='Distribution of OSBuildNumber', ylabel='OSBuildNumber', xlabel='Count')
plt.xticks(rotation=45)
plt.show()

The feature OSBuildNumber can be considered over feature OSVersion

In [None]:
# Frequency of each OSProductSuite value in the training data.
train_data['OSProductSuite'].value_counts()

In [None]:
# Frequency of each OsPlatformSubRelease value in the training data.
train_data['OsPlatformSubRelease'].value_counts()

In [None]:
# Horizontal bar chart of the OsPlatformSubRelease value frequencies.
sub_release_counts = train_data['OsPlatformSubRelease'].value_counts()
ax = sub_release_counts.plot.barh(color='skyblue', edgecolor='black')

ax.set(title='Distribution of OsPlatformSubRelease', ylabel='OsPlatformSubRelease', xlabel='Count')
plt.xticks(rotation=45)
plt.show()

The OsPlatformSubRelease feature is well distributed and can be useful.

EDA for the remaining columns was performed in the same way.

In [None]:
# Annotated correlation heat map over the float64/int64 columns of the training data.
numeric_type = train_data.select_dtypes(include=['float64', 'int64']).columns
corr_matrix = train_data.loc[:, numeric_type].corr()

# Large canvas so the per-cell annotations have room.
plt.figure(figsize=(50, 30))
sns.heatmap(corr_matrix, annot=True)

From this heat map, we can see that many features have a negative correlation with the target variable. Since the features 'IsBetaUser', 'AutoSampleSubmissionEnabled', 'IsFlightDisabled' have only one unique value, we can see 3 white bars in the map.

In [None]:
# Columns kept as model inputs; the label is the 'target' column.
selected_features = [
    "SystemVolumeCapacityMB", "OEMModelID", "CityID", "FirmwareVersionID",
    "InternalBatteryNumberOfCharges", "AntivirusConfigID",
    "PrimaryDiskCapacityMB", "ProcessorModelID", "TotalPhysicalRAMMB",
    "FirmwareManufacturerID", "IEVersionID", "GeoRegionID",
    "OSBuildRevisionOnly", "LocaleEnglishNameID", "CountryID",
    "OSBuildNumber", "OSUILocaleID", "IsSystemProtected", "IsGamer",
    "SignatureVersion", "OSBuildLab", "OSVersion", "EngineVersion",
    "LicenseActivationChannel", "OSGenuineState",
]
X = train_data[selected_features]
y = train_data['target']

In [None]:
# Print the DataFrame.info() summary of the selected feature frame.
X.info()

In [None]:
# Columns routed to the numeric branch of the preprocessing (imputed and scaled further down).
numerical_cols = [
    "SystemVolumeCapacityMB",
    "OEMModelID",
    "CityID",
    "FirmwareVersionID",
    "InternalBatteryNumberOfCharges",
    "AntivirusConfigID",
    "PrimaryDiskCapacityMB",
    "ProcessorModelID",
    "TotalPhysicalRAMMB",
    "FirmwareManufacturerID",
    "IEVersionID",
    "GeoRegionID",
    "OSBuildRevisionOnly",
    "LocaleEnglishNameID",
    "CountryID",
    "OSBuildNumber",
    "OSUILocaleID",
    "IsSystemProtected",
    "IsGamer",
]

# Columns routed to the categorical branch (one-hot encoded further down).
categorical_cols = [
    "SignatureVersion",
    "OSBuildLab",
    "OSVersion",
    "EngineVersion",
    "LicenseActivationChannel",
    "OSGenuineState",
]

Splitting Dataset into Training and Validation Sets

In [None]:
# Hold out 20% of the rows for validation; random_state=42 pins the split so it is reproducible.
# train_test_split is already imported in the notebook's import cell, so it is not re-imported here.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Imputer that replaces NaN entries with the most frequent value of each column.
imp_most_frequent = SimpleImputer(
    strategy='most_frequent',
    missing_values=np.nan,
)

In [None]:
# X_train / X_val are row subsets produced by train_test_split. Assigning columns on such
# derived frames can raise pandas' SettingWithCopyWarning (depending on the pandas /
# scikit-learn versions), so take explicit, independent copies first.
X_train = X_train.copy()
X_val = X_val.copy()

# Learn the per-column fill values on the training split only, then reuse them on the
# validation split so no validation statistics leak into preprocessing.
X_train[numerical_cols] = imp_most_frequent.fit_transform(X_train[numerical_cols])
X_val[numerical_cols] = imp_most_frequent.transform(X_val[numerical_cols])

In [None]:
# One single-step pipeline per column family: scaling for numeric columns,
# one-hot encoding (categories unseen at fit time are ignored) for categorical columns.
numeric_steps = [("standardscaler", StandardScaler())]
categorical_steps = [("onehot", OneHotEncoder(handle_unknown='ignore'))]

numerical_pipeline = Pipeline(steps=numeric_steps)
categorical_pipeline = Pipeline(steps=categorical_steps)

In [None]:
# Route each column family to its own pipeline.
column_branches = [
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols),
]
ct = ColumnTransformer(transformers=column_branches)

In [None]:
# Fit the column transformer on the training split and apply it;
# the validation split is only transformed with the already-fitted transformer.
X_train_transformed = ct.fit_transform(X_train)
X_val_transformed = ct.transform(X_val)

# Models

A simple **Logistic Regression** model
* built during the initial days of the project, as a baseline or dummy model. (Model 1)

In [None]:
# Baseline: logistic regression (liblinear solver) scored by accuracy on the validation split.
model = LogisticRegression(
    C=1.0,
    solver='liblinear',
    max_iter=200,
    random_state=42,
).fit(X_train_transformed, y_train)

y_pred = model.predict(X_val_transformed)
accuracy = accuracy_score(y_val, y_pred)
print(f'Baseline Model Accuracy: {accuracy:.4f}')

**RidgeClassifier** (model 2)

In [None]:
# Ridge classifier scored by accuracy on the validation split.
ridge_model = RidgeClassifier(alpha=1.0).fit(X_train_transformed, y_train)

y_pred = ridge_model.predict(X_val_transformed)
accuracy = accuracy_score(y_val, y_pred)
print(f'Ridge Classifier Accuracy: {accuracy:.4f}')


**SGD Classifier** (Model 3)
* The Stochastic Gradient Descent Classifier with hyperparameter tuning.  

In [None]:
# Randomised search (20 sampled configurations, 3-fold CV) over SGDClassifier settings.
param_dist = dict(
    loss=['log_loss', 'hinge', 'squared_hinge'],
    alpha=[0.0001, 0.001, 0.01, 0.1],
    penalty=['l2', 'l1', 'elasticnet'],
    learning_rate=['constant', 'optimal', 'invscaling', 'adaptive'],
    eta0=[0.001, 0.01, 0.1],
)
sgd_model = SGDClassifier(random_state=42)
random_search = RandomizedSearchCV(
    sgd_model,
    param_dist,
    n_iter=20,
    cv=3,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search.fit(X_train_transformed, y_train)
print(f"Best Parameters: {random_search.best_params_}")

# Score the best estimator found by the search on the held-out validation split.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_val_transformed)
accuracy = accuracy_score(y_val, y_pred)
print(f'Best SGD Model Accuracy: {accuracy:.4f}')


**RandomForestClassifier** (Model 4)
* Random Forest classifier with hyperparameter tuning.  

In [None]:
# Randomised search (10 sampled configurations, 2-fold CV) over random-forest settings.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
param_dist = dict(
    n_estimators=np.arange(100, 301, 100),
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=10,
    cv=2,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
random_search.fit(X_train_transformed, y_train)

# Keep the best forest (reused by the ensemble cells) and score it on the validation split.
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_val_transformed)
accuracy = accuracy_score(y_val, y_pred)
print(f'Tuned Random Forest Model Accuracy: {accuracy:.4f}')
print(f'Best Parameters: {random_search.best_params_}')


**LightGBM  Classifier** (Model 5)
* LGBM with GBDT as the boosting type

In [None]:
# Randomised search (10 sampled configurations, 2-fold CV, accuracy) over LightGBM settings.
lgbm = LGBMClassifier(objective='binary', boosting_type='gbdt', random_state=42)
param_dist = dict(
    num_leaves=[20, 31, 40],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 300],
    max_depth=[-1, 5, 10],
)
random_search = RandomizedSearchCV(
    lgbm,
    param_dist,
    n_iter=10,
    cv=2,
    scoring='accuracy',
    random_state=42,
    n_jobs=-1,
    verbose=1,
)
random_search.fit(X_train_transformed, y_train)

# Keep the best booster (reused by the ensemble cells) and score it on the validation split.
best_lgbm = random_search.best_estimator_
y_pred_lgbm = best_lgbm.predict(X_val_transformed)
accuracy_lgbm = accuracy_score(y_val, y_pred_lgbm)
print(f'LightGBM Model Accuracy: {accuracy_lgbm:.4f}')
print(f'Best Parameters: {random_search.best_params_}')


**Decision Tree Classifier** (Model 6)

In [None]:
# Randomised search (20 sampled configurations, 5-fold CV, accuracy) over decision-tree settings.
param_dist = dict(
    criterion=['gini', 'entropy'],
    max_depth=[3, 5, 10, None],
    min_samples_split=np.arange(2, 20, 2),
    min_samples_leaf=np.arange(1, 10, 2),
)
dt = DecisionTreeClassifier(random_state=42)
random_search = RandomizedSearchCV(
    estimator=dt,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train_transformed, y_train)

# Keep the best tree (reused by the ensemble cells) and score it on the validation split.
best_dt = random_search.best_estimator_
y_pred = best_dt.predict(X_val_transformed)
accuracy = accuracy_score(y_val, y_pred)
print(f'Optimized Decision Tree Accuracy: {accuracy:.4f}')
print(f'Best Parameters: {random_search.best_params_}')


# Ensemble Models

**Stacking Classifier** with Logistic regression as meta model and best models of Random Forest, LightGBM and decision tree as base models.

In [None]:
# Stack the three tuned tree-based models under a logistic-regression meta learner.
base_model_names = ['random_forest', 'lightgbm', 'decision_tree']
base_models = list(zip(base_model_names, [best_rf, best_lgbm, best_dt]))
meta_model = LogisticRegression(solver='liblinear', max_iter=200)

stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    n_jobs=-1,
).fit(X_train_transformed, y_train)

y_pred = stacking_model.predict(X_val_transformed)
stacking_accuracy = accuracy_score(y_val, y_pred)
print(f'Stacking Model Accuracy: {stacking_accuracy:.4f}')


**Voting Classifier** to combine prediction from best models of Random Forest, LightGBM and decision tree.

In [None]:
# Hard (majority) vote over the three tuned tree-based models.
voting_members = [
    ('random_forest', best_rf),
    ('lightgbm', best_lgbm),
    ('decision_tree', best_dt),
]
voting_model = VotingClassifier(estimators=voting_members, voting='hard')
voting_model.fit(X_train_transformed, y_train)

y_pred = voting_model.predict(X_val_transformed)
accuracy = accuracy_score(y_val, y_pred)
print(f'Voting Classifier Accuracy: {accuracy:.4f}')


# Test Dataset

In [None]:
# Fill missing numeric values in the test data with the imputer fitted on the training split.
test_numeric_imputed = imp_most_frequent.transform(test_data[numerical_cols])
test_data[numerical_cols] = test_numeric_imputed

In [None]:
# Select the model inputs from the test data. numerical_cols followed by categorical_cols
# is exactly the column list (and order) used to build X, so reusing the two lists keeps
# the test features in sync with what the column transformer was fitted on instead of
# maintaining a third hand-copied list.
X_test = test_data[numerical_cols + categorical_cols]

In [None]:
# Apply the already-fitted column transformer to the test features (transform only, no refit).
X_test_processed=ct.transform(X_test)

In [None]:
# Predict the test labels with the voting ensemble.
y_pred_submission=voting_model.predict(X_test_processed)

# Submission

In [None]:
# Build the submission table (sequential row ids starting at 0 plus the predicted target)
# and write it to 'submission.csv' without the index column.
submission_ids = range(0, test_data.shape[0])
sub = pd.DataFrame({"id": submission_ids, "target": y_pred_submission})
sub.to_csv('submission.csv', index=False)

#