## SVM

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn import svm
from sklearn.utils import resample

import time

import seaborn as sns
import matplotlib.pyplot as plt


def main():
    df = pd.read_csv("/content/drive/MyDrive/UNLV/train.csv")
    df.drop(["Id"], axis=1, inplace=True)

    df.columns = map(str.lower, df.columns)
    df.rename(columns={"married/single": "married_single"}, inplace=True)

    # Category cols to num
    cate_cols = ["married_single", "profession", "house_ownership", "car_ownership", "city", "state"]

    for col in cate_cols:
        le = LabelEncoder()
        le = le.fit(df[col])
        df[col] = le.transform(df[col])

    print("Label Encoding-Done.")

    # Down sampling
    subset_0 = df[df["risk_flag"] == 0]
    subset_1 = df[df["risk_flag"] == 1]

    subset_0_downsampled = resample(subset_0,
                                    replace=False,
                                    n_samples=len(subset_1),
                                    random_state=42)

    df = pd.concat([subset_0_downsampled, subset_1])

    print("Down Sampling-Done.")

    X = df.drop(["risk_flag"], axis=1)
    y = df["risk_flag"].apply(lambda x: int(x))

    # StandardScaler
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    print("Scaling-Done.")

    X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    clf = svm.SVC() # max_iter: default -1

    # Start: training
    start_time = time.time()

    clf.fit(X_train, y_train)

    # End: training
    end_time = time.time()
    # Calculate training time
    elapsed_time = end_time - start_time

    y_pred = clf.predict(X_val)

    auc = roc_auc_score(y_val, y_pred)

    print(f"AUC = {auc:.3f}")
    print(f"Elapsed Time: {elapsed_time // 60} min {elapsed_time % 60:.2f} sec")


if __name__ == "__main__":
    main()

Label Encoding-Done.
Down Sampling-Done.
Scaling-Done.
AUC = 0.641
Elapsed Time: 2.0 min 48.30 sec


## SVM with Spark

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285398 sha256=0fbe34031425abcb4737e9c453aebefa3a4fc0b28430d594edb334de3df0aa2b
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [21]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler, StringIndexer
from pyspark.ml import Pipeline
from pyspark.sql.functions import col as scol
from pyspark.sql.functions import rand

import time


def main():
    spark = SparkSession.builder.appName("UNLV").getOrCreate()

    df = spark.read.csv("/content/drive/MyDrive/UNLV/train.csv", header=True, inferSchema=True)
    df = df.drop("Id").withColumnRenamed("married/single", "married_single")
    df = df.toDF(*(col.lower() for col in df.columns))
    df = df.withColumn("risk_flag", scol("risk_flag").cast("integer"))

    # Category cols to num
    cate_cols = ["married_single", "profession", "house_ownership", "car_ownership", "city", "state"]

    indexers = [StringIndexer(inputCol=col, outputCol=col + "_idx").fit(df) for col in cate_cols]

    pipeline = Pipeline(stages=indexers)
    df = pipeline.fit(df).transform(df)
    df = df.drop(*cate_cols)

    print("Label Encoding-Done.")

    assembler = VectorAssembler(
        inputCols=[col for col in df.columns if col != "risk_flag"],
        outputCol="origin_features"
    )
    df = assembler.transform(df)
    df = df.select(["origin_features", "risk_flag"])

    # Down sampling
    pos_count = df.filter("risk_flag = 1").count()
    neg_df = df.filter("risk_flag = 0")
    sampled_neg_df = neg_df.orderBy(rand(seed=42)).limit(pos_count)
    df = sampled_neg_df.union(df.filter("risk_flag = 1"))

    print("Down Sampling-Done.")

    # StandardScaler
    scaler = StandardScaler(inputCol="origin_features", outputCol="features")
    scaler_model = scaler.fit(df)
    df_scaled = scaler_model.transform(df)

    df = df_scaled.select(["features", "risk_flag"])

    print("Scaling-Done.")

    # Define model
    train, val = df.randomSplit([0.8, 0.2], seed=42)

    svm = LinearSVC(labelCol="risk_flag", weightCol="risk_flag") # maxIter: default 100

    # Start: training
    start_time = time.time()

    model = svm.fit(train)

    # End: training
    end_time = time.time()
    # Calculate training time
    elapsed_time = end_time - start_time

    predictions = model.transform(val)

    evaluator = BinaryClassificationEvaluator(labelCol="risk_flag", metricName="areaUnderROC")
    auc = evaluator.evaluate(predictions)

    print(f"AUC = {auc:.3f}")
    print(f"Elapsed Time: {elapsed_time // 60} min {elapsed_time % 60:.2f} sec")


if __name__ == "__main__":
    main()

Label Encoding-Done.
Down Sampling-Done.
Scaling-Done.
AUC = 0.503
Elapsed Time: 0.0 min 15.18 sec
