Supervised learning dengan metode regresi linear untuk memprediksi pendapatan tahunan (annual income) customer

In [1]:
# Langkah 1: Instalasi Library
# Pastikan scikit-learn, pandas, dan numpy sudah terinstal
# pip install scikit-learn pandas numpy

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error, r2_score

# Step 2: Load the data.
# labeled_customer_data.csv is used to fit and evaluate the model;
# test_customer_data.csv holds the new rows that are scored later in the notebook.
test_data = pd.read_csv('test_customer_data.csv')
train_data = pd.read_csv('labeled_customer_data.csv')

# Encode the categorical columns as integers.
# Each encoder is fitted on the training data only and then reused for the new
# data, so a given category always maps to the same integer in both frames.
# Fitting a separate encoder per frame would silently assign different codes
# whenever the two frames do not contain exactly the same set of categories.
# NOTE: LabelEncoder.transform raises ValueError if the new data contains a
# category that was not present in the training data.
CATEGORICAL_COLUMNS = ['gender', 'profession']
label_encoders = {}
for column in CATEGORICAL_COLUMNS:
    encoder = LabelEncoder().fit(train_data[column])
    train_data[column] = encoder.transform(train_data[column])
    test_data[column] = encoder.transform(test_data[column])
    label_encoders[column] = encoder  # kept so the mapping can be inspected later

# Step 3: Build the feature matrix and target, then split into training and
# hold-out sets.
FEATURE_COLUMNS = ['gender', 'age', 'spending_score', 'profession', 'work_experience', 'family_size']

X = train_data[FEATURE_COLUMNS]  # features
y = train_data['income']  # regression target

X_new = test_data[FEATURE_COLUMNS]  # same feature columns for the new data

# 80% training, 20% hold-out; fixed random_state for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the features. The scaler is fitted on the training split only and
# the same fitted scaler is applied to the hold-out split and to the new data.
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_new_scaled = scaler.transform(X_new)

# Fit a linear regression model on the scaled training features.
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the hold-out split; these predictions are scored in the next cell.
y_pred = model.predict(X_test)

Evaluasi Model

In [2]:
# Score the hold-out predictions with the usual regression metrics.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # square root of the MSE, i.e. in the units of the target
r2 = r2_score(y_test, y_pred)

# Report each metric on its own line.
for label, value in (
    ("Mean Squared Error (MSE):", mse),
    ("Root Mean Squared Error (RMSE):", rmse),
    ("R2 Score:", r2),
):
    print(label, value)

Mean Squared Error (MSE): 2094402915.6994472
Root Mean Squared Error (RMSE): 45764.647007263666
R2 Score: 0.03689652698541357


Prediksi data baru

In [3]:
# Predict annual income for the new data, using the feature matrix that was
# already scaled with the scaler fitted on the training split.
predicted_income = model.predict(X_new_scaled)

Hasil

In [4]:
# Attach the predictions to the new-data frame as an extra column, then print
# the heading and the frame in one call (newline-separated).
test_data['Predicted Annual Income'] = predicted_income
print("\nHasil Prediksi untuk Data Baru:", test_data, sep="\n")


Hasil Prediksi untuk Data Baru:
     CustomerID  gender  age  income  spending_score  profession  \
0          1601       0   24   51661              79           3   
1          1602       0   71  152910              90           0   
2          1603       1    1  114556              64           3   
3          1604       1    0  143455              62           5   
4          1605       1   30  173096              92           5   
..          ...     ...  ...     ...             ...         ...   
395        1996       0   71  184387              40           0   
396        1997       0   91   73158              32           1   
397        1998       1   87   90961              14           5   
398        1999       1   77  182109               4           4   
399        2000       1   90  110610              52           3   

     work_experience  family_size  Predicted Annual Income  
0                  0            7            112206.073133  
1                  1        