In [1]:
# Directory (relative path) from which train.csv, test.csv and
# sample_submission.csv are read, and into which the submission CSV is written.
file_dir = '데이콘 법원 판결'

In [2]:
import pandas as pd

# Load the training rows (which carry the first_party_winner label used later)
# and the test rows that predictions are produced for.
train_df = pd.read_csv(file_dir + '/train.csv')
test_df = pd.read_csv(file_dir + '/test.csv')

In [3]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Single TF-IDF vectorizer (default settings) shared by the train and test
# encoding calls so both use the same fitted vocabulary.
vectorizer = TfidfVectorizer()
def get_vector(vectorizer, df, train_mode):
    """Encode a case DataFrame as one dense feature matrix.

    The vectorizer is fitted on the ``facts`` column only: when ``train_mode``
    is true it is (re)fitted on ``df['facts']``; otherwise the already-fitted
    vectorizer is reused. ``first_party`` and ``second_party`` are always
    encoded with ``transform`` using that same fitted state, so terms that
    never appear in the training facts are not represented for the party
    columns either.

    Parameters
    ----------
    vectorizer : object
        Exposes ``fit_transform(docs)`` and ``transform(docs)`` returning
        SciPy sparse matrices (e.g. ``TfidfVectorizer``). Mutated when
        ``train_mode`` is true.
    df : pandas.DataFrame
        Must contain the text columns ``facts``, ``first_party`` and
        ``second_party``.
    train_mode : bool
        True to fit the vectorizer on ``df['facts']``; False to only transform.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per row of ``df`` and the column blocks laid
        out as ``[first_party | second_party | facts]``.
    """
    # The fit (if any) must happen before the party columns are transformed.
    if train_mode:
        X_facts = vectorizer.fit_transform(df['facts'])
    else:
        X_facts = vectorizer.transform(df['facts'])
    X_party1 = vectorizer.transform(df['first_party'])
    X_party2 = vectorizer.transform(df['second_party'])

    # toarray() yields plain ndarrays. todense() would yield np.matrix, which
    # NumPy discourages and scikit-learn's input validation does not accept.
    X = np.concatenate(
        [X_party1.toarray(), X_party2.toarray(), X_facts.toarray()],
        axis=1,
    )
    return X


In [4]:
# Fit the vectorizer on the training facts and encode the training rows.
X = get_vector(vectorizer, train_df, True)
# Select the label with single brackets so y is a 1-D Series. The
# double-bracket form yields an (n, 1) DataFrame, which makes scikit-learn
# emit a DataConversionWarning (column_or_1d) on every fit.
y = train_df["first_party_winner"]
# Encode the test rows with the already-fitted vectorizer (no refit).
test_X = get_vector(vectorizer, test_df, False)

In [5]:
# Normalise both feature matrices to plain ndarrays: np.asarray converts an
# np.matrix and leaves an ndarray unchanged.
# np.squeeze is intentionally not applied. It is a no-op for matrices with
# more than one row and column, but it would drop a size-1 axis (a single-row
# input or a one-term vocabulary) and hand the estimators a 1-D array.
X = np.asarray(X)
test_X = np.asarray(test_X)

In [6]:
# Keyword arguments unpacked into CatBoostClassifier when the model list is
# built. The `use_best_model` option was commented out and stays unset.
cat_params = {
    'verbose': 100,
    'random_state': 113,
}

In [7]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the training rows with a fixed seed for model comparison.
# NOTE: X_test / y_test are this hold-out (validation) split of the training
# data; the competition test features live in the separate variable test_X.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier


# Classifiers
# Each display name is declared next to the estimator it labels, then the
# pairs are unzipped into the two parallel lists (`names`, `models`) that the
# scoring helper consumes. Order and contents match the original lists.
# NOTE(review): several estimators are constructed without a random_state, so
# their scores can vary between runs.
names, models = map(list, zip(
    ("Logistic Regression", LogisticRegression(max_iter = 1000)),
    ("KNN Classifier", KNeighborsClassifier(n_neighbors = 149, n_jobs = -1)),
    ("Decision Tree", DecisionTreeClassifier()),
    ("Linear SVM", svm.SVC(kernel = 'linear')),
    ("Random Forest", RandomForestClassifier(n_estimators = 100)),
    ("SGD Classifier", SGDClassifier(loss = 'hinge')),
    ("Ridge Classifier", RidgeClassifier()),
    ("XGBoost", XGBClassifier()),
    ("AdaBoost", AdaBoostClassifier()),
    ("Catboost", CatBoostClassifier(**cat_params)),
))


In [9]:
import time
# Function to return summary of baseline models
def score(X_train, y_train, X_val, y_val, names = names, models = models):
    """Fit every model and tabulate training vs. validation accuracy.

    Parameters
    ----------
    X_train, y_train
        Features and labels each model is fitted on.
    X_val, y_val
        Held-out features and labels used for the validation accuracy.
    names : sequence of str
        Display name for each model, in the same order as ``models``.
    models : sequence of estimators
        Objects exposing ``fit(X, y)`` and ``predict(X)``. They are fitted in
        place, so the caller's estimator objects are modified.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``"Classifier"``,
        ``"Training accuracy"`` and ``"Validation accuracy"``, sorted by
        validation accuracy in descending order. The index keeps each model's
        position in ``models``.

    Raises
    ------
    ValueError
        If ``names`` and ``models`` differ in length.
    """
    # Check the pairing up front so a mismatch fails before any (potentially
    # slow) model fitting rather than when the table is assembled.
    if len(names) != len(models):
        raise ValueError(
            "names and models must have the same length (got %d and %d)"
            % (len(names), len(models))
        )

    score_train, score_val = [], []
    for model in models:
        model.fit(X_train, y_train)
        score_train.append(accuracy_score(y_train, model.predict(X_train)))
        score_val.append(accuracy_score(y_val, model.predict(X_val)))

    score_df = pd.DataFrame({
        "Classifier": list(names),
        "Training accuracy": score_train,
        "Validation accuracy": score_val,
    })
    return score_df.sort_values(by = 'Validation accuracy', ascending = False)


In [10]:
# Fit all baseline models on the training split and display their accuracies,
# ranked by accuracy on the hold-out split (X_test / y_test).
score(X_train, y_train, X_test, y_test, names = names, models = models)

  y = column_or_1d(y, warn=True)
  return self._fit(X, y)
  y = column_or_1d(y, warn=True)
  model.fit(X_train, y_train)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Learning rate set to 0.013797
0:	learn: 0.6914300	total: 127ms	remaining: 2m 6s
100:	learn: 0.6120417	total: 3.9s	remaining: 34.7s
200:	learn: 0.5809726	total: 7.42s	remaining: 29.5s
300:	learn: 0.5578573	total: 11.1s	remaining: 25.8s
400:	learn: 0.5374842	total: 14.6s	remaining: 21.9s
500:	learn: 0.5150843	total: 18.2s	remaining: 18.1s
600:	learn: 0.4915849	total: 21.7s	remaining: 14.4s
700:	learn: 0.4674999	total: 25.4s	remaining: 10.8s
800:	learn: 0.4410493	total: 28.9s	remaining: 7.18s
900:	learn: 0.4153429	total: 32.8s	remaining: 3.6s
999:	learn: 0.3939691	total: 36.3s	remaining: 0us


Unnamed: 0,Classifier,Training accuracy,Validation accuracy
4,Random Forest,1.0,0.655242
0,Logistic Regression,0.937941,0.653226
1,KNN Classifier,0.667508,0.65121
9,Catboost,0.869324,0.643145
6,Ridge Classifier,0.999495,0.612903
7,XGBoost,0.998991,0.610887
8,AdaBoost,0.745207,0.596774
3,Linear SVM,0.996973,0.592742
5,SGD Classifier,0.999495,0.58871
2,Decision Tree,1.0,0.572581


In [11]:
# Final model: a fresh KNN with the same hyperparameters as the baseline entry,
# fitted on all training rows (X, y) rather than only the training split.
# NOTE(review): in the recorded comparison table, Random Forest and Logistic
# Regression scored marginally higher on validation than KNN; confirm this
# choice is intended.
model = KNeighborsClassifier(n_neighbors = 149, n_jobs = -1)
model.fit(X, y)

  return self._fit(X, y)


In [12]:
# Load the sample submission file, used as the template for the output CSV.
submit = pd.read_csv(file_dir + '/sample_submission.csv')

In [13]:
# Predict labels for the encoded test rows with the final model.
pred = model.predict(test_X)

In [15]:
# Store the predictions in the submission frame and write it next to the input
# data, without the DataFrame index.
submit['first_party_winner'] = pred
submit.to_csv(file_dir + '/23.06.07 KNN_submit.csv', index=False)
# Show the first rows as a quick visual check of the written file's contents.
submit.head(10)

Unnamed: 0,ID,first_party_winner
0,TEST_0000,0
1,TEST_0001,1
2,TEST_0002,1
3,TEST_0003,1
4,TEST_0004,1
5,TEST_0005,1
6,TEST_0006,1
7,TEST_0007,1
8,TEST_0008,1
9,TEST_0009,1
