In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('sample_submission.csv')

# FEATRUE ENGINEERING

In [2]:
data['first_party'].value_counts()

United States       154
Illinois              9
Maryland              8
Florida               8
New York              7
                   ... 
David Carpenter       1
Larry Gene Heath      1
PGA TOUR, Inc.        1
PPL Montana, LLC      1
Markman               1
Name: first_party, Length: 2110, dtype: int64

In [3]:
data['second_party'].value_counts()

United States                        240
California                            19
United States of America              15
Illinois                              13
Federal Communications Commission     10
                                    ... 
David Boren, Governor of Oklahoma      1
Federal Bureau of Prisons et al.       1
Town of Harrison                       1
Charles Burr et al.                    1
Westview Instruments, Inc.             1
Name: second_party, Length: 1974, dtype: int64

In [4]:
def term_frequency(df):
    tf1 = (df['facts'][1:2]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
    tf1.columns = ['words','tf']
    return tf1.head()

In [5]:
data['facts']

0       On June 27, 1962, Phil St. Amant, a candidate ...
1       Ramon Nelson was riding his bike when he suffe...
2       An Alabama state court convicted Billy Joe Mag...
3       Victor Linkletter was convicted in state court...
4       On April 24, 1953 in Selma, Alabama, an intrud...
                              ...                        
2473    Congress amended the Clean Air Act through the...
2474    Alliance Bond Fund, Inc., an investment fund, ...
2475    In 1992, the District Court sentenced Manuel D...
2476    On March 8, 1996, Enrico St. Cyr, a lawful per...
2477    Herbert Markman owns the patent to a system th...
Name: facts, Length: 2478, dtype: object

In [6]:
data['facts'][1:2].apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()

Unnamed: 0,index,0
0,the,12
1,a,8
2,was,8
3,and,8
4,of,7
...,...,...
117,Owens’,1
118,"distribution,",1
119,potentially,1
120,crack,1


In [7]:
# 출력 옵션 설정
# pd.set_option("display.max_rows", None)  # 모든 행 표시
pd.set_option("display.max_columns", None)  # 모든 열 표시
pd.set_option("display.width", None)  # 줄 바꿈 없이 전체 내용 표시
data['facts']

0       On June 27, 1962, Phil St. Amant, a candidate ...
1       Ramon Nelson was riding his bike when he suffe...
2       An Alabama state court convicted Billy Joe Mag...
3       Victor Linkletter was convicted in state court...
4       On April 24, 1953 in Selma, Alabama, an intrud...
                              ...                        
2473    Congress amended the Clean Air Act through the...
2474    Alliance Bond Fund, Inc., an investment fund, ...
2475    In 1992, the District Court sentenced Manuel D...
2476    On March 8, 1996, Enrico St. Cyr, a lawful per...
2477    Herbert Markman owns the patent to a system th...
Name: facts, Length: 2478, dtype: object

In [None]:
data.iloc[1]['facts']

In [None]:
data.iloc[0]['facts']


In [None]:
data['first_party_winner'].value_counts()

In [8]:
from sklearn.utils import resample

subset_0 = data[data["first_party_winner"] == 0]
subset_1 = data[data["first_party_winner"] == 1]

subset_1_downsampled = resample(subset_1,
                                replace=False,
                                n_samples=800,
                                random_state=42)

data = pd.concat([subset_0, subset_1_downsampled])

In [9]:
data['first_party_winner'].value_counts()

0    829
1    800
Name: first_party_winner, dtype: int64

In [10]:
# transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers.tokenization_utils import TruncationStrategy

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')


porter = PorterStemmer()

def stem_text_func(text):
    token_words=word_tokenize(text)
    stem_sentence = []
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return "".join(stem_sentence)

[nltk_data] Downloading package punkt to /home/chunbae/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/chunbae/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/chunbae/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
!pip install stop_words

In [12]:
from stop_words import get_stop_words
# 불용어 리스트
languages = [
   'English'
]
my_stop_word_list = []
for i in languages:
    my_stop_word_list.append(get_stop_words(i.lower()))
my_stop_word_list = sum(my_stop_word_list, [])

In [13]:
# https://gist.github.com/nealrs/96342d8231b75cf4bb82
contractions = {
  "ain't": "am not",
  "aren't": "are not",
  "can't": "cannot",
  "can't've": "cannot have",
  "'cause": "because",
  "could've": "could have",
  "couldn't": "could not",
  "couldn't've": "could not have",
  "didn't": "did not",
  "doesn't": "does not",
  "don't": "do not",
  "hadn't": "had not",
  "hadn't've": "had not have",
  "hasn't": "has not",
  "haven't": "have not",
  "he'd": "he would",
  "he'd've": "he would have",
  "he'll": "he will",
  "he'll've": "he will have",
  "he's": "he is",
  "how'd": "how did",
  "how'd'y": "how do you",
  "how'll": "how will",
  "how's": "how is",
  "I'd": "I would",
  "I'd've": "I would have",
  "I'll": "I will",
  "I'll've": "I will have",
  "I'm": "I am",
  "I've": "I have",
  "isn't": "is not",
  "it'd": "it had",
  "it'd've": "it would have",
  "it'll": "it will",
  "it'll've": "it will have",
  "it's": "it is",
  "let's": "let us",
  "ma'am": "madam",
  "mayn't": "may not",
  "might've": "might have",
  "mightn't": "might not",
  "mightn't've": "might not have",
  "must've": "must have",
  "mustn't": "must not",
  "mustn't've": "must not have",
  "needn't": "need not",
  "needn't've": "need not have",
  "o'clock": "of the clock",
  "oughtn't": "ought not",
  "oughtn't've": "ought not have",
  "shan't": "shall not",
  "sha'n't": "shall not",
  "shan't've": "shall not have",
  "she'd": "she would",
  "she'd've": "she would have",
  "she'll": "she will",
  "she'll've": "she will have",
  "she's": "she is",
  "should've": "should have",
  "shouldn't": "should not",
  "shouldn't've": "should not have",
  "so've": "so have",
  "so's": "so is",
  "that'd": "that would",
  "that'd've": "that would have",
  "that's": "that is",
  "there'd": "there had",
  "there'd've": "there would have",
  "there's": "there is",
  "they'd": "they would",
  "they'd've": "they would have",
  "they'll": "they will",
  "they'll've": "they will have",
  "they're": "they are",
  "they've": "they have",
  "to've": "to have",
  "wasn't": "was not",
  "we'd": "we had",
  "we'd've": "we would have",
  "we'll": "we will",
  "we'll've": "we will have",
  "we're": "we are",
  "we've": "we have",
  "weren't": "were not",
  "what'll": "what will",
  "what'll've": "what will have",
  "what're": "what are",
  "what's": "what is",
  "what've": "what have",
  "when's": "when is",
  "when've": "when have",
  "where'd": "where did",
  "where's": "where is",
  "where've": "where have",
  "who'll": "who will",
  "who'll've": "who will have",
  "who's": "who is",
  "who've": "who have",
  "why's": "why is",
  "why've": "why have",
  "will've": "will have",
  "won't": "will not",
  "won't've": "will not have",
  "would've": "would have",
  "wouldn't": "would not",
  "wouldn't've": "would not have",
  "y'all": "you all",
  "y'alls": "you alls",
  "y'all'd": "you all would",
  "y'all'd've": "you all would have",
  "y'all're": "you all are",
  "y'all've": "you all have",
  "you'd": "you had",
  "you'd've": "you would have",
  "you'll": "you you will",
  "you'll've": "you you will have",
  "you're": "you are",
  "you've": "you have"
}


In [14]:
import re

#main 단어만 추출 > 차후 이 단어들로 test
def preprocessing_sentence(sentence, remove_stopwords = True):
    #소문자화
    sentence = sentence.lower()
    # /n 제거
    sentence = re.sub(r'/n',' ',sentence)
    #괄호로 닫친 문자열 괄호 제거
    sentence = re.sub(r'\([^)]*\)',' ',sentence)
    #쌍따옴표 제거
    sentence = re.sub('"', ' ', sentence) 
    #약어 정규화
    sentence = ' '.join([contractions[t] if t in contractions else t for t in sentence.split(" ")])
    #소유격 제거
    sentence = re.sub(r"'s\b"," ",sentence)
    #특수문자 제거
    sentence = re.sub("[^a-zA-Z]"," ", sentence)
    
    #불용어 제거
    if my_stop_word_list:
        tokens = ' '.join(word for word in sentence.split() if not word in my_stop_word_list if len(word)>1)
    else:
        tokens = ' '.join(word for word in sentence.split() if len(word)>1)
    return tokens


In [15]:
data['facts'] = [preprocessing_sentence(i) for i in data['facts']]
test['facts'] = [preprocessing_sentence(i) for i in test['facts']]

In [None]:
data['facts'] = data['facts'].astype('str')
test['facts'] = test['facts'].astype('str')

In [None]:
facts_corpus_lose = ' '.join(data[data['first_party_winner'] == 0]['facts'])
facts_corpus_win = ' '.join(data[data['first_party_winner'] == 1]['facts'])

In [None]:
from wordcloud import WordCloud
from wordcloud import STOPWORDS
import matplotlib.pyplot as plt

title_wordcloud = WordCloud(stopwords=STOPWORDS, background_color='white', height=2000, width=4000).generate(facts_corpus_lose)
plt.figure(figsize=(16,8))
plt.imshow(title_wordcloud)
plt.axis('off')
plt.show()

In [None]:
from wordcloud import WordCloud
from wordcloud import STOPWORDS
import matplotlib.pyplot as plt

title_wordcloud = WordCloud(stopwords=STOPWORDS, background_color='white', height=2000, width=4000).generate(facts_corpus_win)
plt.figure(figsize=(16,8))
plt.imshow(title_wordcloud)
plt.axis('off')
plt.show()

# model train

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
def get_vector(vectorizer, df, train_mode):
    if train_mode:
        X_facts = vectorizer.fit_transform(df['facts'])
    else:
        X_facts = vectorizer.transform(df['facts'])
    X_party1 = vectorizer.transform(df['first_party'])
    X_party2 = vectorizer.transform(df['second_party'])
    
    X = np.concatenate([X_party1.todense(), X_party2.todense(), X_facts.todense()], axis=1)
    return X

In [17]:
X = get_vector(vectorizer, data, True)
Y = data["first_party_winner"]
X_T = get_vector(vectorizer, test, False)

In [18]:
X = np.squeeze(np.asarray(X))
X_T = np.squeeze(np.asarray(X_T))

In [19]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0,1))

scaler.fit(X)
X_scaled = scaler.transform(X)
X_T_scaled = scaler.transform(X_T)

In [20]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm 
from sklearn.linear_model import LogisticRegression

In [21]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=42)

In [22]:
cat_params = {
                'verbose' : 0,
                'random_state': 113,
               # 'use_best_model' : True,
             }

In [23]:
from sklearn import svm
# Classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import RidgeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier


# Classifiers
names = [
    "Logistic Regression",
    "KNN Classifier",
    "Decision Tree",
    "LinearSVC",
    "Linear SVM",
    "Poly SVM",
    "RBF SVM",
    "sigmoid SVM",
    "Neural Network",
    "Random Forest",
    "SGD Classifier",
    "Ridge Classifier",
    "XGBoost",
    "AdaBoost",
    "Catboost",
    "Gaussian Naive Bayes"
]

models = [
    LogisticRegression(max_iter = 1000),
    KNeighborsClassifier(n_neighbors = 149, n_jobs = -1),
    DecisionTreeClassifier(),
    LinearSVC(random_state=113),
    svm.SVC(kernel = 'linear'),
    svm.SVC(kernel = 'poly'),
    svm.SVC(kernel = 'rbf'),
    svm.SVC(kernel = 'sigmoid'),
    MLPClassifier(),
    RandomForestClassifier(n_estimators = 100),
    SGDClassifier(loss = 'hinge'),
    RidgeClassifier(),
    XGBClassifier(),
    AdaBoostClassifier(),
    CatBoostClassifier(**cat_params),
    GaussianNB()
]

In [24]:
import time
# Function to return summary of baseline models
def score(X_train, y_train, X_val, y_val, names = names, models = models):
    score_df, score_train, score_val = pd.DataFrame(), [], []
    x = time.time()
    for model in models:
        model.fit(X_train, y_train)
        y_train_pred, y_val_pred = model.predict(X_train), model.predict(X_val)
        score_train.append(accuracy_score(y_train, y_train_pred))
        score_val.append(accuracy_score(y_val, y_val_pred))
    
    score_df["Classifier"], score_df["Training accuracy"], score_df["Validation accuracy"] = names, score_train, score_val
    score_df.sort_values(by = 'Validation accuracy', ascending = False, inplace = True)
    return score_df

In [25]:
score(X_train, y_train, X_test, y_test, names = names, models = models)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Unnamed: 0,Classifier,Training accuracy,Validation accuracy
13,AdaBoost,0.746738,0.592025
1,KNN Classifier,0.552571,0.588957
0,Logistic Regression,1.0,0.579755
8,Neural Network,1.0,0.576687
7,sigmoid SVM,0.929394,0.567485
3,LinearSVC,1.0,0.56135
4,Linear SVM,1.0,0.56135
9,Random Forest,1.0,0.558282
11,Ridge Classifier,1.0,0.558282
6,RBF SVM,1.0,0.555215


In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier

GRID = [
    {
     'estimator': [MLPClassifier(random_state=113)],
     'estimator__solver': ['adam'],
     'estimator__learning_rate_init': [0.0001],
     'estimator__max_iter': [300],
     'estimator__hidden_layer_sizes': [(500, 400, 300, 200, 100), (400, 400, 400, 400, 400), (300, 300, 300, 300, 300), (200, 200, 200, 200, 200)],
     'estimator__activation': ['logistic', 'tanh', 'relu'],
     'estimator__alpha': [0.0001, 0.001, 0.005],
     'estimator__early_stopping': [True, False]
     }
]

PIPELINE = Pipeline([('estimator', MLPClassifier())])


In [None]:
grid_search = GridSearchCV(estimator=PIPELINE, param_grid=GRID, 
                            scoring= "accuracy",
                            n_jobs=-1, cv=5, refit=True, verbose=1, 
                            return_train_score=False)

grid_search.fit(X, Y)


In [None]:
print(f"최적의 파라미터 : {grid_search.best_params_}")
print(f"최적의 파라미터로 모델의 정확도 : {grid_search.best_score_}")

In [27]:
from sklearn.ensemble import VotingClassifier

#classifiers = [('SGD Classifier', SGDClassifier(loss = 'hinge',random_state=113)),
#               ('sigmoid SVM', svm.SVC(kernel = 'sigmoid', random_state=113)),
#                ('Linear SVM', svm.SVC(kernel = 'linear', random_state=113))
#              ]
#vc = VotingClassifier(estimators=classifiers)
clf = LogisticRegression(random_state=113)
# Fit 'vc' to the traing set and predict test set labels
clf.fit(X_train, y_train)
y_pred_train=clf.predict(X_train)
y_pred_test = clf.predict(X_test)
print("Training Accuracy score:",accuracy_score(y_train, y_pred_train))
print("Testing Accuracy score:",accuracy_score(y_test, y_pred_test))

Training Accuracy score: 1.0
Testing Accuracy score: 0.5797546012269938


In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm import tqdm
k = 10 # a number of folds best is 20
skfold = StratifiedKFold(n_splits=k, shuffle=True, random_state=113)

y_valid_pred = 0*Y
y_test_pred = 0

model = MLPClassifier(random_state=113)

for i, (train_index, test_index) in tqdm(enumerate(skfold.split(X, Y))):    
    X_train_fold, X_valid_fold = X[train_index], X[test_index]
    y_train_fold, y_valid_fold = Y.iloc[train_index], Y.iloc[test_index]
    
    print( "\nFold ", i)
    
    fit_model = model.fit(X_train_fold, y_train_fold)
        

    def score_model(model,X_train, X_test, y_train, y_test,
               show_plot=True):   
        y_pred = model.predict(X_test)
        print(f"accuracy_score : {accuracy_score(y_test, y_pred)}")
    
        predictions_comparision = pd.DataFrame({'Actual': y_test.tolist(), 'Predicted': y_pred.tolist()}).sample(25)
        if show_plot == True:
            predictions_comparision.plot(kind="bar", figsize=(12,8),title="Actual vs predicted values")
            print(predictions_comparision.sample(10))    
    
        return {
            "accuracy_score" : accuracy_score(y_test, y_pred)
            }
    score_model(fit_model, X_train_fold, X_valid_fold, y_train_fold, y_valid_fold, show_plot=True)
    
    # Predict value Clipping
    y_test_pred += fit_model.predict_proba(X_T)
    
y_test_pred /= k  # Average test set predictions


In [None]:
np.argmax(y_test_pred, axis = 1)

# predict

In [28]:
sub = pd.read_csv('sample_submission.csv')
sub['first_party_winner'] = clf.predict(X_T)
#sub['first_party_winner'] = np.argmax(y_test_pred, axis = 1)
sub

Unnamed: 0,ID,first_party_winner
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,1
3,TEST_0003,0
4,TEST_0004,1
...,...,...
1235,TEST_1235,0
1236,TEST_1236,0
1237,TEST_1237,0
1238,TEST_1238,0


In [29]:
sub['first_party_winner'].value_counts()

1    674
0    566
Name: first_party_winner, dtype: int64

In [30]:
sub.to_csv('submit_230630(1)_RL.csv', index=False)