In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
# Directory holding the competition CSVs. Defined once so the notebook can be
# pointed at another location by editing a single line.
DATA_DIR = '/kaggle/input/play-station-plus-subscription-prediction/Dataset'

# Labelled rows used for fitting, and the rows to predict for the submission.
train_data = pd.read_csv(f'{DATA_DIR}/train.csv')
test_data = pd.read_csv(f'{DATA_DIR}/test.csv')

In [3]:
# Set the test identifiers aside for the submission file, then remove the
# identifier column from both frames.
User_ID = test_data['User_ID'].copy()
train_data = train_data.drop(columns=['User_ID'])
test_data = test_data.drop(columns=['User_ID'])

In [4]:
def add_engineered_features(frame, satisfaction_fill):
    """Return a copy of ``frame`` with three ratio features appended.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``PS_Plus_Subscription_Duration``,
        ``Last_Played_Days_Ago``, ``Average_Monthly_Spending``,
        ``User_Satisfaction_Rating`` and ``Support_Requests``.
    satisfaction_fill : float
        Value substituted for missing ``User_Satisfaction_Rating`` entries
        before the satisfaction/support ratio is computed.

    Returns
    -------
    pandas.DataFrame
        New frame with ``Engagement_Score``, ``Value_Perception`` and
        ``Satisfaction_Support_Ratio`` added (or overwritten if present).
        The input frame is not modified.
    """
    satisfaction = frame['User_Satisfaction_Rating'].fillna(satisfaction_fill)
    # Each denominator is shifted by 1 so that a value of 0 does not divide by zero.
    return frame.assign(
        Engagement_Score=frame['PS_Plus_Subscription_Duration'] / (frame['Last_Played_Days_Ago'] + 1),
        Value_Perception=frame['Average_Monthly_Spending'] / (frame['PS_Plus_Subscription_Duration'] + 1),
        Satisfaction_Support_Ratio=satisfaction / (frame['Support_Requests'] + 1),
    )


# The fill value is taken from the training data only and reused for the test
# data, so both frames are transformed with the same statistic instead of the
# test set being imputed with its own median.
satisfaction_median = train_data['User_Satisfaction_Rating'].median()
train_data = add_engineered_features(train_data, satisfaction_median)
test_data = add_engineered_features(test_data, satisfaction_median)

In [5]:
# Remove the age, gender and region columns from both frames.
unused_columns = ['User_Age', 'User_Gender', 'User_Region']
train_data = train_data.drop(columns=unused_columns)
test_data = test_data.drop(columns=unused_columns)

In [6]:
# Separate the label column from the predictor columns.
target_column = 'Subscription_Canceled'
Y = train_data[target_column]
X = train_data.drop(columns=[target_column])

In [7]:
# Hold out 20% of the labelled rows; stratifying on Y keeps the class
# proportions the same in both parts, and the fixed seed makes the split
# repeatable.
X_train, X_val, Y_train, Y_val = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=5,
)

In [8]:
# Column groups routed to each preprocessing branch.
numerical_features = [
    'PS_Plus_Subscription_Duration',
    'Support_Requests',
    'Last_Played_Days_Ago',
    'Average_Monthly_Spending',
    'Engagement_Score',
    'Value_Perception',
    'Satisfaction_Support_Ratio',
]
categorical_features = ['Payment_Type']

# Numeric branch: fill missing values with the column median, then standardise.
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored rather than
# raising an error.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its own column group.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

In [9]:
# Class-weighted logistic regression with a raised iteration cap and a fixed seed.
classifier = LogisticRegression(
    max_iter=500,
    class_weight='balanced',
    random_state=55,
)

# Full estimator: preprocessing followed by the classifier, fitted as one unit.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

In [10]:
# Fit on the training split only.
model.fit(X_train, Y_train)

# Score the held-out split so the model's quality is actually measured before
# predictions are written; the validation rows were not seen during fitting.
val_accuracy = accuracy_score(Y_val, model.predict(X_val))
print(f"Validation accuracy: {val_accuracy:.4f}")

In [11]:
# Despite its name, `X_test_processed` holds the predicted labels for the
# test rows, not transformed features.
X_test_processed = model.predict(test_data)

# Pair each saved test identifier with its prediction and write the result
# without the DataFrame index.
submission_columns = {
    'User_ID': User_ID,
    'Subscription_Canceled': X_test_processed,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)
print("Predictions saved to submission.csv")

Predictions saved to submission.csv
