In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.preprocessing import LabelEncoder
import joblib


In [None]:
# Load the competition training set from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/playground-series-s4e6/train.csv')

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Remove the 'id' column from the training frame (mutates `df` in place).
df.drop(columns=['id'], inplace=True)

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Print the column dtypes and non-null counts of the training frame.
df.info()

In [None]:
# Print a contingency table of every feature against 'Target' for a quick look at
# how each column's values are distributed across the classes.
# The feature names are kept in their own variable: binding them to `num_cols`
# would overwrite the numeric-column list that the scaling cells rely on whenever
# this cell is re-run after that list has been defined.
feature_cols = [name for name in df.columns.tolist() if name != 'Target']
for col in feature_cols:
    contingency_table = pd.crosstab(df[col], df['Target'])
    print(contingency_table)

In [None]:
# Split the columns into numeric features (listed by hand) and categorical
# features (everything else except the target), then store the categorical
# columns with object dtype.
all_cols = list(df.columns)
num_cols = [
    'Previous qualification (grade)',
    'Admission grade',
    'Age at enrollment',
    'Curricular units 1st sem (credited)',
    'Curricular units 1st sem (enrolled)',
    'Curricular units 1st sem (evaluations)',
    'Curricular units 1st sem (approved)',
    'Curricular units 1st sem (grade)',
    'Curricular units 1st sem (without evaluations)',
    'Curricular units 2nd sem (credited)',
    'Curricular units 2nd sem (enrolled)',
    'Curricular units 2nd sem (evaluations)',
    'Curricular units 2nd sem (approved)',
    'Curricular units 2nd sem (grade)',
    'Curricular units 2nd sem (without evaluations)',
    'Unemployment rate',
    'Inflation rate',
    'GDP',
]
cat_cols = [name for name in all_cols if name != 'Target' and name not in num_cols]
df[cat_cols] = df[cat_cols].astype('object')

In [None]:
def filter_by_target_frequency(df, target_column, threshold=0.5):
    """Drop the rows of the leading categories of ``target_column``.

    Categories are visited in the order they first appear in the column while a
    running total of their row counts is kept. Every category visited while the
    running total is still below ``int(threshold * len(df))`` is dropped; the
    first category that reaches the cutoff, and all categories after it, are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    target_column : str
        Name of the column whose categories are counted.
    threshold : float, default 0.5
        Fraction of the total row count used as the cumulative cutoff.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``target_column`` value was not dropped.
    """
    cutoff = int(threshold * len(df))

    dropped = []
    running_total = 0
    for value, freq in Counter(df[target_column]).items():
        running_total += freq
        if running_total >= cutoff:
            # Cutoff reached: this category and all later ones are kept.
            break
        dropped.append(value)

    keep_mask = ~df[target_column].isin(dropped)
    return df[keep_mask]

In [None]:
# Cast every categorical column to string values in one vectorised assignment.
df[cat_cols] = df[cat_cols].astype(str)

In [None]:
# Replace each categorical column with integer codes.
# NOTE(review): a single LabelEncoder instance is refitted for every column, so
# once the loop ends it only remembers the classes of the last column; the
# mappings of the other columns cannot be recovered from it afterwards.
categorical_encoder = LabelEncoder()
for cat_name in cat_cols:
    column_values = df[cat_name]
    df[cat_name] = categorical_encoder.fit(column_values).transform(column_values)

In [None]:
# Show the (rows, columns) size of the training frame after encoding.
df.shape

In [None]:
# Fit a dedicated encoder on the target labels and store the integer codes in
# place of the original values; the fitted encoder is kept so predictions can
# be decoded later.
target_encoder = LabelEncoder().fit(df['Target'])
df['Target'] = target_encoder.transform(df['Target'])

In [None]:
# Separate the feature matrix from the encoded target column.
y = df['Target']
X = df.drop(columns=['Target'])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the numeric feature columns; the fitted scaler is kept for later use.
# NOTE(review): the scaler is fitted on all rows of X, i.e. before the
# train/test split performed further down, so the hold-out rows contribute to
# the scaling statistics.
scaler = StandardScaler().fit(X[num_cols])
X[num_cols] = scaler.transform(X[num_cols])

In [None]:
# Hold out 20% of the rows for evaluation, then fit a random forest on the rest.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
model = RandomForestClassifier(n_estimators=200, max_depth=20, random_state=42)
model.fit(X_train, y_train)

In [None]:
# Score the hold-out split: overall accuracy, per-class metrics, confusion matrix.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:", classification_report(y_test, y_pred), sep="\n")
print("\nConfusion Matrix:", confusion_matrix(y_test, y_pred), sep="\n")

In [None]:
# Load the competition test set from the Kaggle input directory.
test_df = pd.read_csv('/kaggle/input/playground-series-s4e6/test.csv')

In [None]:
# Preview the first rows of the raw test frame.
test_df.head()

In [None]:
# Keep the 'id' column for the submission file and remove it from the features
# (pop returns the column and deletes it from `test_df` in place).
ids = test_df.pop('id')

In [None]:
# Preview the test frame again now that 'id' has been removed.
test_df.head()

In [None]:
# Store the categorical test columns with object dtype, as was done for the training frame.
test_df[cat_cols] = test_df[cat_cols].astype('object')

In [None]:
# Cast every categorical test column to string values in one vectorised assignment.
test_df[cat_cols] = test_df[cat_cols].astype(str)

In [None]:
# Encode the test categoricals with the SAME label -> integer mapping the model
# was trained on. Refitting an encoder on the test data assigns codes from the
# test set's own distinct values, so any category present in only one of the two
# files shifts the codes and the model receives differently-numbered features.
# The training frame no longer holds the original labels (they were replaced by
# their codes), so the categorical columns are re-read from the training CSV and
# cast to str exactly as they were before the training encoder was fitted.
train_categories = pd.read_csv(
    '/kaggle/input/playground-series-s4e6/train.csv', usecols=cat_cols
).astype(str)
for col in cat_cols:
    train_encoder = LabelEncoder().fit(train_categories[col])
    code_by_label = {label: code for code, label in enumerate(train_encoder.classes_)}
    # Labels that never occur in the training data have no code; map them to -1
    # rather than letting them collide with a real training category.
    test_df[col] = test_df[col].map(code_by_label).fillna(-1).astype(int)

In [None]:
# Apply the scaler that was fitted on the training features. Calling
# fit_transform here would re-estimate the mean and variance from the test set,
# so the model would see numeric features on a different scale than the one it
# was trained on.
test_df[num_cols] = scaler.transform(test_df[num_cols])

In [None]:
# Predict the encoded classes for the test set, decode them back to the
# original target labels and write the submission file.
final_pred = model.predict(test_df)
submission = pd.DataFrame({
    'id': ids,
    'Target': target_encoder.inverse_transform(final_pred),
})

submission.to_csv('/kaggle/working/submission.csv', index=False)