# TRAIN DENGAN TARGET

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

In [2]:
# Load the three competition files: training features, training labels and
# the unlabeled test features.
train_df, target_df, test_df = (
    pd.read_csv('dataset_findIt/train.csv'),
    pd.read_csv('dataset_findIt/target.csv'),
    pd.read_csv('dataset_findIt/test.csv'),
)

In [3]:
# Preview the training features; pandas elides the middle rows of a large frame.
train_df

Unnamed: 0,developerCountry,countryCode,userRatingCount,primaryGenreName,downloads,deviceType,hasPrivacyLink,hasTermsOfServiceLink,hasTermsOfServiceLinkRating,isCorporateEmailScore,adSpent,appAge,averageUserRating,appContentBrandSafetyRating,appDescriptionBrandSafetyRating,mfaRating
0,NORWAY,RO,127731,Sports,,smartphone,True,True,low,99.0,14.017220,160.400000,4.0,medium,low,low
1,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,0,Medical,50 - 100,GLOBAL,True,,,99.0,,17.500000,0.0,,low,low
2,UNITED ARAB EMIRATES,CZ,51143,Games,50000000 - 100000000,GLOBAL,True,True,low,0.0,31.883163,30.766667,4.0,,low,low
3,GERMANY,GLOBAL,1074,Games,,GLOBAL,True,,,99.0,,71.533333,4.0,,low,low
4,CANNOT IDENTIFY COUNTRY,GLOBAL,17,Tools,1000 - 5000,GLOBAL,True,,,99.0,,52.400000,4.0,,low,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,0,Utilities,,GLOBAL,True,,,99.0,,26.266667,0.0,,low,low
6996,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,0,Business,,GLOBAL,True,,,,,23.800000,0.0,,low,low
6997,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,0,Personalization,10 - 50,GLOBAL,True,,,0.0,,27.500000,,,medium,low
6998,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,0,Business,10 - 50,GLOBAL,True,False,high,99.0,,124.033333,0.0,,low,low


In [4]:
# Preview the test features; unlike the training frame this one also has an `ID` column.
test_df

Unnamed: 0,ID,developerCountry,countryCode,userRatingCount,primaryGenreName,downloads,deviceType,hasPrivacyLink,hasTermsOfServiceLink,hasTermsOfServiceLinkRating,isCorporateEmailScore,adSpent,appAge,averageUserRating,appContentBrandSafetyRating,appDescriptionBrandSafetyRating,mfaRating
0,2807,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,2,Medical,,GLOBAL,True,,,,,81.600000,5.0,,low,low
1,1742,ICELAND,EMEA,0,Games,5000 - 10000,GLOBAL,True,True,low,0.0,0.027742,24.700000,0.0,,low,low
2,806,UNITED STATES,RU,13059,Games,,smartphone,True,False,high,99.0,9.249056,40.300000,4.0,,low,low
3,2635,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,0,Games,,GLOBAL,True,False,high,,,39.233333,0.0,,medium,low
4,9047,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,0,Shopping,,GLOBAL,,,,,,65.533333,0.0,,low,low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,2288,UNITED STATES,GLOBAL,0,Productivity,,GLOBAL,True,True,low,,,94.100000,0.0,,low,low
2996,5541,CANNOT IDENTIFY COUNTRY,GLOBAL,10,Business,100 - 500,GLOBAL,True,,,99.0,,25.033333,5.0,,low,low
2997,9259,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,2,Finance,,GLOBAL,,,,,,76.000000,2.0,,low,low
2998,3477,ADDRESS NOT LISTED IN PLAYSTORE,GLOBAL,0,Business,1 - 5,GLOBAL,True,,,99.0,,100.633333,0.0,,low,low


In [5]:
# Preview the training labels (single `coppaRisk` column).
target_df

Unnamed: 0,coppaRisk
0,False
1,False
2,False
3,False
4,False
...,...
6995,False
6996,False
6997,False
6998,False


In [6]:
# Print the (rows, columns) shape of each frame, each preceded by its label
# and separated by a blank-line gap.
print("Train :", train_df.shape, sep="\n")
print("\n")
print("Test:", test_df.shape, sep="\n")
print("\n")
print("Target:", target_df.shape, sep="\n")

Train :
(7000, 16)


Test:
(3000, 17)


Target:
(7000, 1)


In [7]:
# Combine features and target: build a new frame from the training features
# with the `coppaRisk` label attached (aligned on the row index).
df = train_df.assign(coppaRisk=target_df['coppaRisk'])

In [8]:
# Separate the target column from the feature matrix.
y = df['coppaRisk']
x = df.drop(columns='coppaRisk')

- Fitur : variabel independen --> masukan yang digunakan untuk memprediksi
- Target: variabel dependen --> hasil yang ingin diprediksi

In [9]:
# Count missing values per column, largest first, then show only the columns
# that actually contain missing values.
missing_values = df.isna().sum().sort_values(ascending=False)
missing_values.loc[missing_values.gt(0)]

appContentBrandSafetyRating    6162
adSpent                        5679
hasTermsOfServiceLinkRating    4635
hasTermsOfServiceLink          4635
downloads                      2149
averageUserRating              1232
isCorporateEmailScore          1128
hasPrivacyLink                  750
countryCode                      64
appAge                           50
dtype: int64

Sebelum menangani missing values, di sini saya akan melakukan split data terlebih dahulu untuk mencegah data leakage.

In [10]:
# Hold out 20% of the labelled rows as a validation set for an initial
# evaluation; the fixed seed keeps the split reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

Karena banyaknya data yang hilang, tidak mungkin saya melakukan drop data begitu saja. Maka dari itu, saya akan mengganti nilai NaN pada kolom numerik dengan nilai mean, sedangkan pada kolom kategorikal diganti dengan nilai yang paling sering muncul (most frequent).

In [13]:
# Identify numeric and categorical feature columns by dtype. Both lists are
# reused further down when the ColumnTransformer is built.
num_col = x.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_col = x.select_dtypes(include=['object', 'bool']).columns.tolist()

# train_test_split returns frames that pandas still tracks as slices of `x`.
# Take explicit copies so the column assignments below write to independent
# frames instead of triggering SettingWithCopyWarning.
x_train = x_train.copy()
x_val = x_val.copy()

# Handle missing values: column mean for numeric features, most frequent
# value for categorical features.
num_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Fit each imputer on the training split only and reuse the learned
# statistics on the validation split, so validation rows never influence
# the fill values.
x_train[num_col] = num_imputer.fit_transform(x_train[num_col])
x_val[num_col] = num_imputer.transform(x_val[num_col])

x_train[categorical_col] = categorical_imputer.fit_transform(x_train[categorical_col])
x_val[categorical_col] = categorical_imputer.transform(x_val[categorical_col])

In [None]:
# Abandoned approach, kept commented out for reference: a per-column
# LabelEncoder fitted on the training split. It raised
# "ValueError: y contains previously unseen labels" because the validation
# split holds labels the encoder never saw during fit.
# # Encode each categorical column on train and val
# encoders = {}
# for col in categorical_col:
#     le = LabelEncoder()
#     le.fit(x_train[col].astype(str))
#     x_train[col] = le.transform(x_train[col].astype(str))
#     x_val[col] = le.transform(x_val[col].astype(str))
#     encoders[col] = le

ValueError: y contains previously unseen labels: 'UNITED STATES'

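Setelah melihat error pada kode di atas, penggunaan LabelEncoder tidak tepat karena encoder gagal saat menemui kategori yang tidak ada pada data train. Maka saya menggunakan One Hot Encoder dengan `handle_unknown='ignore'`.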

In [16]:
# Build the preprocessing stage.
# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric columns: fill gaps with the column mean.
numeric_transformer = SimpleImputer(strategy='mean')

# Route each column group to its transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, num_col),
    ('cat', categorical_transformer, categorical_col),
])


Membuat dan Melatih Model

In [17]:
# Chain the preprocessing stage and a random-forest classifier into a single
# estimator so both are fitted and applied together.
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42, n_estimators=100)),
])

In [None]:
# Train the pipeline on the training split and predict the validation split
# (fit returns the fitted pipeline, so the calls can be chained).
y_pred = model_pipeline.fit(x_train, y_train).predict(x_val)

# Evaluate: share of validation rows predicted correctly.
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9064285714285715


In [20]:
# Predict the target for the competition test set with the fitted pipeline.
# test_df carries an extra `ID` column that is in neither num_col nor
# categorical_col.
# NOTE(review): presumably the ColumnTransformer ignores it through its
# default remainder handling — confirm.
# NOTE(review): the pipeline was fitted on the 80% training split only;
# consider refitting on all labelled rows before the final prediction.
test_pred = model_pipeline.predict(test_df)

In [24]:
# Load the submission template; its first two columns define the output layout.
format_df = pd.read_csv('dataset_findIt/submission_format.csv')

# Guard against a row-count mismatch between predictions and template.
assert len(test_pred) == len(format_df), "Jumlah prediksi dan format submission tidak sama!"

# Build the submission frame: first template column copied as-is, second
# template column filled with the predictions.
# NOTE(review): predictions are attached by row position; this assumes the
# template rows are in the same order as test.csv — confirm against the IDs.
id_col, pred_col = format_df.columns[0], format_df.columns[1]
submission = pd.DataFrame({
    id_col: format_df[id_col],
    pred_col: test_pred,
})

# Save to CSV without the pandas index.
submission.to_csv('submission.csv', index=False)
print("✅ File 'submission.csv' berhasil dibuat sesuai format.")


✅ File 'submission.csv' berhasil dibuat sesuai format.
