In [None]:
# Import necessary libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Load the CSV file, drop rows with missing values and keep the first 40000 rows
file_path = 'data2.csv'
df = pd.read_csv(file_path)
df = df.dropna()
#df = preprocess_data(df)
df = df.head(40000)

# Preprocess the data
X = df.iloc[:, 2:80]  # Features (exclude f_0, f_1 and label columns)
y1 = df.iloc[:, 80]  # First binary label
y2 = df.iloc[:, 81]  # Second binary label

# One-hot encode categorical features (f_2 to f_32)
X = pd.get_dummies(X, columns=[f'f_{i}' for i in range(2, 33)])

# Split once, passing both label columns: train_test_split applies the same
# row partition to every array it is given, so the features and both targets
# stay aligned without a second call.
X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(
    X, y1, y2, test_size=0.2, random_state=42
)

# Forest sizes to sweep (70 to 130 inclusive). Defined once so the training
# loop and the plot's x-axis cannot drift apart.
n_estimators_values = range(70, 131)

# Test-set accuracy per forest size. Dedicated lists are used instead of
# rebinding y1/y2, so the label Series stay intact if the cell is partially
# re-run.
y1_accuracies = []
y2_accuracies = []

# Train one random forest per label for every forest size
for n_trees in n_estimators_values:
    rfc1 = RandomForestClassifier(n_estimators=n_trees, random_state=42)
    rfc1.fit(X_train, y1_train)

    rfc2 = RandomForestClassifier(n_estimators=n_trees, random_state=42)
    rfc2.fit(X_train, y2_train)

    # Predict the test data
    y1_pred = rfc1.predict(X_test)
    y2_pred = rfc2.predict(X_test)

    # Compute each accuracy once and reuse it for both the curve and the report
    y1_accuracy = accuracy_score(y1_test, y1_pred)
    y2_accuracy = accuracy_score(y2_test, y2_pred)
    y1_accuracies.append(y1_accuracy)
    y2_accuracies.append(y2_accuracy)

    # Evaluate the model
    print("Results for first binary label (Column 80):")
    print(confusion_matrix(y1_test, y1_pred))
    print(classification_report(y1_test, y1_pred))
    print("Accuracy:", y1_accuracy)

    print("\nResults for second binary label (Column 81):")
    print(confusion_matrix(y2_test, y2_pred))
    print(classification_report(y2_test, y2_pred))
    print("Accuracy:", y2_accuracy)


# Generate the x-axis data (integers from 70 to 130, matching the sweep above)
x = np.array(n_estimators_values)

# Create the plot
fig, ax = plt.subplots()

# Plot the accuracy curves for both labels
ax.plot(x, y1_accuracies, label='is_clicked', marker='o')
ax.plot(x, y2_accuracies, label='is_installed', marker='o')

# Set the axis labels
ax.set_xlabel('n_estimators')
ax.set_ylabel('accuracy')

# Set the title
ax.set_title('Y1 and Y2 vs X')

# Add a legend
ax.legend()
fig.savefig('graph_high_res.png', dpi=800)
# Show the plot
plt.show()

toy_train and toy_test

In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Read the toy training set, discard rows with missing values and keep the
# first 30000 remaining rows.
file_path = 'toy_train.csv'
#df = preprocess_data(df)
df = pd.read_csv(file_path).dropna().head(30000)

# Columns 80 and 81 hold the two binary labels; columns 2..79 are the features
# (f_0 and f_1 are left out).
y1 = df.iloc[:, 80]
y2 = df.iloc[:, 81]

# One-hot encode the categorical features f_2 .. f_32
categorical_cols = [f'f_{i}' for i in range(2, 33)]
X = pd.get_dummies(df.iloc[:, 2:80], columns=categorical_cols)

# A single call partitions the features and both label columns with the same
# shuffled row split (80% train / 20% test, seed 42).
X_train, X_test, y1_train, y1_test, y2_train, y2_test = train_test_split(
    X, y1, y2, test_size=0.2, random_state=42
)

# One 90-tree random forest per label
rfc1 = RandomForestClassifier(n_estimators=90, random_state=42)
rfc2 = RandomForestClassifier(n_estimators=90, random_state=42)
rfc1.fit(X_train, y1_train)
rfc2.fit(X_train, y2_train)

# Predictions on the held-out 20%
y1_pred = rfc1.predict(X_test)
y2_pred = rfc2.predict(X_test)

# Report confusion matrix, per-class metrics and accuracy for each label
for heading, y_true, y_hat in (
    ("Results for first binary label (Column 80):", y1_test, y1_pred),
    ("\nResults for second binary label (Column 81):", y2_test, y2_pred),
):
    print(heading)
    print(confusion_matrix(y_true, y_hat))
    print(classification_report(y_true, y_hat))
    print("Accuracy:", accuracy_score(y_true, y_hat))


Results for first binary label (Column 80):
[[4267   82]
 [ 852  799]]
              precision    recall  f1-score   support

           0       0.83      0.98      0.90      4349
           1       0.91      0.48      0.63      1651

    accuracy                           0.84      6000
   macro avg       0.87      0.73      0.77      6000
weighted avg       0.85      0.84      0.83      6000

Accuracy: 0.8443333333333334

Results for second binary label (Column 81):
[[4461   92]
 [ 763  684]]
              precision    recall  f1-score   support

           0       0.85      0.98      0.91      4553
           1       0.88      0.47      0.62      1447

    accuracy                           0.86      6000
   macro avg       0.87      0.73      0.76      6000
weighted avg       0.86      0.86      0.84      6000

Accuracy: 0.8575


In [4]:
# Read the separate toy test file
test_file_path = 'toy_test.csv'
test_df = pd.read_csv(test_file_path)

# Labels sit in columns 80 and 81
y1_test_new = test_df.iloc[:, 80]
y2_test_new = test_df.iloc[:, 81]

# Features are columns 2..79. One-hot encode f_2 .. f_32, then align the
# result to the column layout of X_train (set by the training cell):
# dummy columns absent from this file are added as all-zero, and columns
# not present in X_train are dropped.
X_test_new = pd.get_dummies(
    test_df.iloc[:, 2:80],
    columns=[f'f_{i}' for i in range(2, 33)],
).reindex(columns=X_train.columns, fill_value=0)

# Score the test file with the already-fitted classifiers rfc1 / rfc2
y1_pred_new = rfc1.predict(X_test_new)
y2_pred_new = rfc2.predict(X_test_new)

# Report confusion matrix, per-class metrics and accuracy for each label
for heading, y_true, y_hat in (
    ("Results for first binary label (Column 80) on new test set:", y1_test_new, y1_pred_new),
    ("\nResults for second binary label (Column 81) on new test set:", y2_test_new, y2_pred_new),
):
    print(heading)
    print(confusion_matrix(y_true, y_hat))
    print(classification_report(y_true, y_hat))
    print("Accuracy:", accuracy_score(y_true, y_hat))

Results for first binary label (Column 80) on new test set:
[[19367   408]
 [ 3976  3862]]
              precision    recall  f1-score   support

           0       0.83      0.98      0.90     19775
           1       0.90      0.49      0.64      7838

    accuracy                           0.84     27613
   macro avg       0.87      0.74      0.77     27613
weighted avg       0.85      0.84      0.82     27613

Accuracy: 0.8412342012820049

Results for second binary label (Column 81) on new test set:
[[20644   464]
 [ 3454  3051]]
              precision    recall  f1-score   support

           0       0.86      0.98      0.91     21108
           1       0.87      0.47      0.61      6505

    accuracy                           0.86     27613
   macro avg       0.86      0.72      0.76     27613
weighted avg       0.86      0.86      0.84     27613

Accuracy: 0.8581103103610618


noisy_train and noisy_test

In [6]:
# Read the noisy training set, drop incomplete rows, then keep at most the
# first 40000 of the rows that remain.
file_path = 'noisy_train.csv'
df = pd.read_csv(file_path)
#df = preprocess_data(df)
df = df.dropna().head(40000)

# Feature block (columns 2..79, i.e. without f_0 / f_1 and the label columns)
# with the categorical features f_2 .. f_32 one-hot encoded.
one_hot_columns = [f'f_{i}' for i in range(2, 33)]
X = pd.get_dummies(df.iloc[:, 2:80], columns=one_hot_columns)

# The two binary targets
y1, y2 = df.iloc[:, 80], df.iloc[:, 81]

# Same 80/20 shuffled split (seed 42) applied to the features and both targets
(X_train, X_test,
 y1_train, y1_test,
 y2_train, y2_test) = train_test_split(X, y1, y2, test_size=0.2, random_state=42)

# Fit one 90-tree forest per target; fit() returns the fitted estimator itself
rfc1, rfc2 = (
    RandomForestClassifier(n_estimators=90, random_state=42).fit(X_train, target)
    for target in (y1_train, y2_train)
)

# Predict the held-out rows
y1_pred, y2_pred = rfc1.predict(X_test), rfc2.predict(X_test)

# Evaluate the first label
print("Results for first binary label (Column 80):")
print(confusion_matrix(y1_test, y1_pred))
print(classification_report(y1_test, y1_pred))
print("Accuracy:", accuracy_score(y1_test, y1_pred))

# Evaluate the second label
print("\nResults for second binary label (Column 81):")
print(confusion_matrix(y2_test, y2_pred))
print(classification_report(y2_test, y2_pred))
print("Accuracy:", accuracy_score(y2_test, y2_pred))


Results for first binary label (Column 80):
[[2843   42]
 [ 656  448]]
              precision    recall  f1-score   support

           0       0.81      0.99      0.89      2885
           1       0.91      0.41      0.56      1104

    accuracy                           0.83      3989
   macro avg       0.86      0.70      0.73      3989
weighted avg       0.84      0.83      0.80      3989

Accuracy: 0.8250188017046879

Results for second binary label (Column 81):
[[2996   42]
 [ 606  345]]
              precision    recall  f1-score   support

           0       0.83      0.99      0.90      3038
           1       0.89      0.36      0.52       951

    accuracy                           0.84      3989
   macro avg       0.86      0.67      0.71      3989
weighted avg       0.85      0.84      0.81      3989

Accuracy: 0.8375532714966157


In [7]:
# NOTE(review): this cell fits brand-new classifiers on an 80/20 split of
# noisy_test.csv and rebinds df, X, X_train, rfc1, rfc2, ... from the previous
# cell. It does not score the models trained on noisy_train.csv against this
# file -- confirm that this is the intended experiment.

# Read the noisy test file, drop incomplete rows, keep the first 20000 rows
file_path = 'noisy_test.csv'
raw_rows = pd.read_csv(file_path)
#df = preprocess_data(df)
df = raw_rows.dropna().head(20000)

# Targets: columns 80 and 81
y1 = df.iloc[:, 80]
y2 = df.iloc[:, 81]

# Features: columns 2..79, with f_2 .. f_32 expanded into one-hot columns
X = df.iloc[:, 2:80]
X = pd.get_dummies(X, columns=['f_' + str(i) for i in range(2, 33)])

# Identical seed and test fraction in both calls give the same row partition,
# so the second call is only needed for its label halves.
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y1_train, y1_test = train_test_split(X, y1, **split_options)
_, _, y2_train, y2_test = train_test_split(X, y2, **split_options)

# Train a 90-tree random forest for each target
rfc1 = RandomForestClassifier(n_estimators=90, random_state=42).fit(X_train, y1_train)
rfc2 = RandomForestClassifier(n_estimators=90, random_state=42).fit(X_train, y2_train)

# Predict the held-out rows
y1_pred = rfc1.predict(X_test)
y2_pred = rfc2.predict(X_test)

# Print confusion matrix, classification report and accuracy per target
print("Results for first binary label (Column 80):")
print(confusion_matrix(y1_test, y1_pred))
print(classification_report(y1_test, y1_pred))
print("Accuracy:", accuracy_score(y1_test, y1_pred))

print("\nResults for second binary label (Column 81):")
print(confusion_matrix(y2_test, y2_pred))
print(classification_report(y2_test, y2_pred))
print("Accuracy:", accuracy_score(y2_test, y2_pred))

Results for first binary label (Column 80):
[[2823   76]
 [ 661  440]]
              precision    recall  f1-score   support

           0       0.81      0.97      0.88      2899
           1       0.85      0.40      0.54      1101

    accuracy                           0.82      4000
   macro avg       0.83      0.69      0.71      4000
weighted avg       0.82      0.82      0.79      4000

Accuracy: 0.81575

Results for second binary label (Column 81):
[[3022   49]
 [ 646  283]]
              precision    recall  f1-score   support

           0       0.82      0.98      0.90      3071
           1       0.85      0.30      0.45       929

    accuracy                           0.83      4000
   macro avg       0.84      0.64      0.67      4000
weighted avg       0.83      0.83      0.79      4000

Accuracy: 0.82625
