## SVM with scikit-learn

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import svm
from sklearn.utils import resample

import time

import seaborn as sns
import matplotlib.pyplot as plt


def main():
    df = pd.read_csv("/content/drive/MyDrive/UNLV/train.csv")
    df.drop(["Id"], axis=1, inplace=True)

    df.columns = map(str.lower, df.columns)
    df.rename(columns={"married/single": "married_single"}, inplace=True)

    # --------------------
    # Category cols to num
    cate_cols = ["married_single", "profession", "house_ownership", "car_ownership", "city", "state"]

    for col in cate_cols:
        le = LabelEncoder()
        le = le.fit(df[col])
        df[col] = le.transform(df[col])

    print("Label Encoding-Done.")

    # --------------------
    # Down sampling
    subset_0 = df[df["risk_flag"] == 0]
    subset_1 = df[df["risk_flag"] == 1]

    subset_0_downsampled = resample(subset_0,
                                    replace=False,
                                    n_samples=len(subset_1),
                                    random_state=42)

    df = pd.concat([subset_0_downsampled, subset_1])

    print("Down Sampling-Done.")

    X = df.drop(["risk_flag"], axis=1)
    y = df["risk_flag"].apply(lambda x: int(x))

    # --------------------
    # Data split
    X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, stratify=y_train_val, random_state=42)

    # --------------------
    # StandardScaler
    scaler = StandardScaler()

    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    print("Scaling-Done.")

    clf = svm.SVC(max_iter=100000) # max_iter: default -1

    # --------------------
    # Start: training
    start_time = time.time()

    clf.fit(X_train_scaled, y_train)

    # End: training
    end_time = time.time()
    # Calculate training time
    elapsed_time = end_time - start_time

    val_pred = clf.predict(X_val_scaled)
    test_pred = clf.predict(X_test_scaled)

    val_auc = roc_auc_score(y_val, val_pred)
    test_auc = roc_auc_score(y_test, test_pred)

    print(f"\nVal AUC = {val_auc}")
    print(f"Test AUC = {test_auc}")
    print(f"Elapsed Time: {elapsed_time // 60} min {elapsed_time % 60:.2f} sec")


if __name__ == "__main__":
    main()

Label Encoding-Done.
Down Sampling-Done.
Scaling-Done.

Val AUC = 0.6444148469059189
Test AUC = 0.6394106749787947
Elapsed Time: 1.0 min 40.02 sec


## SVM with pyspark

In [4]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285398 sha256=5fc9a6a2d07b2ba00bc8a07ae46fefcf918547072c0381c39a6d4589d17c6c92
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [12]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col as scol
from pyspark.sql.functions import rand
import time

def main():
    spark = SparkSession.builder.appName("UNLV").getOrCreate()

    df = spark.read.csv("/content/drive/MyDrive/UNLV/train.csv", header=True, inferSchema=True)
    df = df.drop("Id").withColumnRenamed("married/single", "married_single")
    df = df.toDF(*(col.lower() for col in df.columns))
    df = df.withColumn("risk_flag", scol("risk_flag").cast("integer"))

    # --------------------
    # Category cols to num
    cate_cols = ["married_single", "profession", "house_ownership", "car_ownership", "city", "state"]

    indexers = [StringIndexer(inputCol=col, outputCol=col + "_idx").fit(df) for col in cate_cols]

    pipeline = Pipeline(stages=indexers)
    df = pipeline.fit(df).transform(df)
    df = df.drop(*cate_cols)

    print("Label Encoding-Done.")

    assembler = VectorAssembler(
        inputCols=[col for col in df.columns if col != "risk_flag"],
        outputCol="origin_features"
    )
    df = assembler.transform(df)
    df = df.select(["origin_features", "risk_flag"])

    # --------------------
    # Down sampling
    pos_count = df.filter("risk_flag = 1").count()
    neg_df = df.filter("risk_flag = 0")
    sampled_neg_df = neg_df.orderBy(rand(seed=42)).limit(pos_count)
    df = sampled_neg_df.union(df.filter("risk_flag = 1"))

    print("Down Sampling-Done.")

    # --------------------
    # Data split
    df_risk_0 = df.filter(scol("risk_flag") == 0)
    df_risk_1 = df.filter(scol("risk_flag") == 1)

    train_val_risk_0, test_risk_0 = df_risk_0.randomSplit([0.8, 0.2], seed=42)
    train_val_risk_1, test_risk_1 = df_risk_1.randomSplit([0.8, 0.2], seed=42)

    train_val = train_val_risk_0.union(train_val_risk_1)
    test = test_risk_0.union(test_risk_1)

    train_risk_0, val_risk_0 = train_val_risk_0.randomSplit([0.8, 0.2], seed=42)
    train_risk_1, val_risk_1 = train_val_risk_1.randomSplit([0.8, 0.2], seed=42)

    train = train_risk_0.union(train_risk_1)
    val = val_risk_0.union(val_risk_1)

    # print(train.groupBy("risk_flag").count().show())
    # print(val.groupBy("risk_flag").count().show())
    # print(test.groupBy("risk_flag").count().show())

    # --------------------
    # StandardScaler
    scaler = StandardScaler(inputCol="origin_features", outputCol="features")
    scaler_model = scaler.fit(train)
    train_scaled = scaler_model.transform(train)
    val_scaled = scaler_model.transform(val)
    test_scaled = scaler_model.transform(test)

    train_scaled = train_scaled.select(["features", "risk_flag"])
    val_scaled = val_scaled.select(["features", "risk_flag"])
    test_scaled = test_scaled.select(["features", "risk_flag"])

    print("Scaling-Done.")

    svm = LinearSVC(labelCol="risk_flag", maxIter=100000)  # maxIter: default 100

    # --------------------
    # Start: training
    start_time = time.time()

    model = svm.fit(train_scaled)

    # End: training
    end_time = time.time()
    # Calculate training time
    elapsed_time = end_time - start_time

    val_preds = model.transform(val_scaled)
    test_preds = model.transform(test_scaled)

    evaluator = BinaryClassificationEvaluator(labelCol="risk_flag", metricName="areaUnderROC")
    val_auc = evaluator.evaluate(val_preds)
    test_auc = evaluator.evaluate(test_preds)

    print(f"\nVal AUC = {val_auc}")
    print(f"Test AUC = {test_auc}")
    print(f"Elapsed Time: {elapsed_time // 60} min {elapsed_time % 60:.2f} sec")


if __name__ == "__main__":
    main()

Label Encoding-Done.
Down Sampling-Done.
Scaling-Done.

Val AUC = 0.5571115341365019
Test AUC = 0.548107129030474
Elapsed Time: 0.0 min 57.80 sec
