In [3]:
import os
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn2pmml import PMMLPipeline
from sklearn2pmml import sklearn2pmml
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn import svm
from sklearn.neural_network import MLPClassifier

In [4]:
# Run configuration, shared through environment variables:
#   MODEL   - which trainer the final cell runs ("rf", "neural" or "svm")
#   DATASET - sub-directory of dataset/ that holds the input CSV and receives the outputs
os.environ.update(MODEL="rf", DATASET="UPB2015")

In [6]:
# Load the labelled transfers of the configured dataset and split them by label.
df = pd.read_csv(f"dataset/{os.environ.get('DATASET')}/useful_messages.csv")
positive_df = df.loc[df["usefulTransfer"] == 1]
negative_df = df.loc[df["usefulTransfer"] == 0]

# Undersample to the size of the smaller class so both labels are equally represented.
lesser_len = min(positive_df.shape[0], negative_df.shape[0])

# A fixed seed keeps the balanced sample (and every metric derived from it)
# identical across kernel restarts; 42 matches the seed used by the trainers.
positive_df = positive_df.sample(lesser_len, random_state=42)
negative_df = negative_df.sample(lesser_len, random_state=42)

# Positives first, then negatives, re-indexed from 0.
balanced_df = pd.concat([positive_df, negative_df], ignore_index=True)

In [7]:
# Inspect the class-balanced frame: positive rows come first, then negative rows
# (concat order); the stored output shows only the first and last rows.
balanced_df


Unnamed: 0,messageHopCount,oldFriendWithDestination,oldRelayBattery,oldCommonCommunity,oldDataMemory,newFriendWithDestination,newRelayBattery,newCommonCommunity,newDataMemory,usefulTransfer
0,1,0,0.730968,1,0.022,1,0.240536,0,0.025,1.0
1,1,0,0.597545,1,0.025,0,0.333218,0,0.028,1.0
2,1,0,0.597545,1,0.018,1,0.240536,0,0.017,1.0
3,1,0,0.730968,1,0.015,1,0.240536,0,0.013,1.0
4,1,0,0.730968,1,0.018,1,0.240536,0,0.014,1.0
...,...,...,...,...,...,...,...,...,...,...
2887,0,1,0.240536,0,0.023,1,0.730968,0,0.029,0.0
2888,1,1,0.730968,0,0.021,0,0.550437,0,0.023,0.0
2889,4,1,0.240536,0,0.026,1,0.550437,0,0.027,0.0
2890,2,1,0.637417,0,0.024,1,0.730968,0,0.019,0.0


In [8]:
# Feature frame without the label; used to derive the column groups below.
preprocessed_df = balanced_df.drop(columns="usefulTransfer").copy()

# Column groups, one per scaling/encoding strategy.
minmax_columns = ["messageHopCount"]
categorial_columns = [
    "oldFriendWithDestination",
    "oldCommonCommunity",
    "newFriendWithDestination",
    "newCommonCommunity",
]

# Every remaining float64/int64 column is standardised.
standard_columns = [
    col
    for col in preprocessed_df.select_dtypes(include=["float64", "int64"]).columns
    if col not in minmax_columns and col not in categorial_columns
]

preprocessor = ColumnTransformer(
    transformers=[
        ("minmax", MinMaxScaler((0, 1)), minmax_columns),
        ("standard", StandardScaler(), standard_columns),
        ("onehotencoder", OneHotEncoder(), categorial_columns),
    ],
    remainder="passthrough",
)

# Note: from here on `preprocessed_df` holds the transformer output, not a DataFrame.
# The training functions put `preprocessor` in a pipeline and fit it again on the
# training split only.
preprocessed_df = preprocessor.fit_transform(preprocessed_df)

# Raw (untransformed) features and encoded labels handed to the trainers.
X = balanced_df.copy().drop(columns=["usefulTransfer"])
result_df = pd.DataFrame(balanced_df["usefulTransfer"].copy().squeeze())
labelencoder = LabelEncoder()
# LabelEncoder expects a 1-D target. Passing the one-column frame made scikit-learn
# flatten it itself and emit a `column_or_1d` warning, so hand over the column directly.
y = labelencoder.fit_transform(result_df["usefulTransfer"])


  y = column_or_1d(y, warn=True)


In [10]:
def display_metrics(y_test, y_pred, save=False):
    """Report accuracy, the classification report and the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.
    save : bool, default False
        When true, the report is written to ``metrics.txt`` in the current
        working directory (overwriting any existing file); otherwise it is
        printed.
    """
    # Evaluate the model. All text is built before any file is opened, so a
    # failing metric call cannot leave a truncated metrics.txt behind.
    accuracy = accuracy_score(y_test, y_pred)
    sections = [
        f"Accuracy: {accuracy:.2f}\n",
        "Classification Report:\n",
        str(classification_report(y_test, y_pred)) + '\n',
        "Confusion Matrix:\n",
        str(confusion_matrix(y_test, y_pred)) + '\n',
    ]

    if save:
        # The context manager closes the handle even if a write fails.
        with open("metrics.txt", "w") as f:
            f.writelines(sections)
    else:
        for section in sections:
            print(section)

In [6]:

def train_neural(X, y):
    """Train an MLP classifier, export it to PMML and save its test metrics.

    Parameters
    ----------
    X : DataFrame
        Raw feature columns; they are transformed by the module-level
        ``preprocessor`` inside the pipeline.
    y : array-like
        Labels aligned with ``X``.

    Side effects
    ------------
    Writes the PMML model and ``metrics.txt`` into ``dataset/<DATASET>/``
    (relative to the current working directory). The working directory is
    changed for the export and always restored afterwards.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    clf = MLPClassifier(
        solver="lbfgs",
        activation="relu",
        alpha=1e-5,
        hidden_layer_sizes=(64, 32),
        random_state=42,
        max_iter=500,
    )

    # PMMLPipeline is a scikit-learn Pipeline subclass, so one fit serves both the
    # test-set predictions and the export; the estimators are no longer fitted twice.
    pmml_pipeline = PMMLPipeline([("preprocessor", preprocessor), ("classifier", clf)])
    pmml_pipeline.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = pmml_pipeline.predict(X_test)

    dataset = os.environ.get('DATASET')
    base_working_dir = os.getcwd()
    os.chdir(f"{base_working_dir}/dataset/{dataset}")
    try:
        # Export the model to PMML
        # NOTE(review): the file name says "rf" although this is the MLP model;
        # looks like a copy-paste leftover, confirm what the consumer of the
        # .pmml file expects before renaming.
        sklearn2pmml(pmml_pipeline, f"model-rf-{dataset}.pmml")
        display_metrics(y_test, y_pred, save=True)
    finally:
        # Restore the cwd even on failure so later cells keep resolving
        # relative paths from the notebook directory.
        os.chdir(base_working_dir)


In [7]:

def train_svm(X, y):
    """Tune and train an RBF SVM, export it to PMML and save its test metrics.

    Parameters
    ----------
    X : DataFrame
        Raw feature columns; they are transformed by the module-level
        ``preprocessor`` inside the pipeline.
    y : array-like
        Labels aligned with ``X``.

    Side effects
    ------------
    Writes the PMML model and ``metrics.txt`` into ``dataset/<DATASET>/``
    (relative to the current working directory). The working directory is
    changed for the export and always restored afterwards.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Hyperparameter tuning using Grid Search. The search runs over the whole
    # pipeline so that C and gamma are selected on the same preprocessed
    # features the final model is trained on (previously the search saw the
    # raw, unscaled columns).
    param_grid = {
        "classifier__C": [0.1, 1, 10],
        "classifier__gamma": ["scale", "auto"],  # Kernel coefficient
    }
    search_pipeline = Pipeline(
        [("preprocessor", preprocessor), ("classifier", svm.SVC(kernel="rbf"))]
    )
    # refit=False: only the winning parameters are needed, the final fit happens below.
    grid_search = GridSearchCV(search_pipeline, param_grid, refit=False, cv=5)
    grid_search.fit(X_train, y_train)

    # PMMLPipeline is a scikit-learn Pipeline subclass, so one fit serves both the
    # test-set predictions and the export. The step names match the search
    # pipeline, which lets the best parameters be applied directly.
    pmml_pipeline = PMMLPipeline(
        [("preprocessor", preprocessor), ("classifier", svm.SVC(kernel="rbf"))]
    )
    pmml_pipeline.set_params(**grid_search.best_params_)
    pmml_pipeline.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = pmml_pipeline.predict(X_test)

    dataset = os.environ.get('DATASET')
    base_working_dir = os.getcwd()
    os.chdir(f"{base_working_dir}/dataset/{dataset}")
    try:
        # Export the model to PMML
        # NOTE(review): the file name says "rf" although this is the SVM model;
        # looks like a copy-paste leftover, confirm what the consumer of the
        # .pmml file expects before renaming.
        sklearn2pmml(pmml_pipeline, f"model-rf-{dataset}.pmml")
        display_metrics(y_test, y_pred, save=True)
    finally:
        # Restore the cwd even on failure so later cells keep resolving
        # relative paths from the notebook directory.
        os.chdir(base_working_dir)

In [11]:

def train_random_forest(X, y):
    """Train a random forest, export it to PMML and save its test metrics.

    Parameters
    ----------
    X : DataFrame
        Raw feature columns; they are transformed by the module-level
        ``preprocessor`` inside the pipeline.
    y : array-like
        Labels aligned with ``X``.

    Side effects
    ------------
    Writes the PMML model and ``metrics.txt`` into ``dataset/<DATASET>/``
    (relative to the current working directory). The working directory is
    changed for the export and always restored afterwards.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

    # PMMLPipeline is a scikit-learn Pipeline subclass, so one fit serves both the
    # test-set predictions and the export; the estimators are no longer fitted twice.
    pmml_pipeline = PMMLPipeline(
        [("preprocessor", preprocessor), ("classifier", rf_classifier)]
    )
    pmml_pipeline.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = pmml_pipeline.predict(X_test)

    dataset = os.environ.get('DATASET')
    base_working_dir = os.getcwd()
    os.chdir(f"{base_working_dir}/dataset/{dataset}")
    try:
        # Export the model to PMML
        sklearn2pmml(pmml_pipeline, f"model-rf-{dataset}.pmml")
        display_metrics(y_test, y_pred, save=True)
    finally:
        # Restore the cwd even on failure so later cells keep resolving
        # relative paths from the notebook directory.
        os.chdir(base_working_dir)


In [12]:
# Train the model selected through the MODEL environment variable.
trainers = {
    "rf": train_random_forest,
    "neural": train_neural,
    "svm": train_svm,
}
selected_model = os.environ.get("MODEL")
if selected_model not in trainers:
    # Fail loudly: a typo in MODEL used to skip training silently, leaving
    # whatever model/metrics files were already on disk.
    raise ValueError(
        f"Unsupported MODEL {selected_model!r}; expected one of {sorted(trainers)}"
    )
trainers[selected_model](X, y)

