# 법원 데이터 판결

## EDA

In [11]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [3]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1


In [7]:
df.shape

(2478, 5)

In [9]:
# Boolean mask selecting the rows whose 'first_party_winner' label equals 1.
cond = df['first_party_winner'] == 1
df[cond].shape # 1649 of the 2478 rows have first_party_winner == 1

(1649, 5)

## baseline code

In [14]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np

vectorizer = TfidfVectorizer()

def get_vector(vectorizer, df, train_mode):
    """Build one dense feature matrix from the party names and the case facts.

    Args:
        vectorizer: Object offering ``fit_transform`` and ``transform``; each
            result must provide ``toarray()``.
        df: Table indexable by the column names ``'facts'``,
            ``'first_party'`` and ``'second_party'``.
        train_mode: When truthy the vectorizer is fitted on ``df['facts']``
            while transforming it; otherwise it is only applied.

    Returns:
        The dense first-party, second-party and facts matrices joined
        along axis 1, in that order.
    """
    # The vectorizer is fitted on the facts only; both party columns are
    # encoded afterwards with that same fitted vectorizer.
    encode_facts = vectorizer.fit_transform if train_mode else vectorizer.transform
    facts_matrix = encode_facts(df['facts'])

    party_matrices = [
        vectorizer.transform(df[column])
        for column in ('first_party', 'second_party')
    ]

    # Densify every block before stacking them side by side.
    dense_blocks = [matrix.toarray() for matrix in party_matrices]
    dense_blocks.append(facts_matrix.toarray())
    return np.concatenate(dense_blocks, axis=1)

# Example usage:
# X_train = get_vector(vectorizer, train, train_mode=True)
# Y_train = train['first_party_winner']
# model = LogisticRegression()
# model.fit(X_train, Y_train)


In [21]:
X_train = get_vector(vectorizer, train, True)
Y_train = train["first_party_winner"]
X_test = get_vector(vectorizer, test, False)

In [22]:
model = LogisticRegression()
model.fit(X_train, Y_train)

In [23]:
submit = pd.read_csv('./sample_submission.csv')

In [24]:
pred = model.predict(X_test)

In [25]:
submit['first_party_winner'] = pred
submit.to_csv('./baseline_submit.csv', index=False)
print('Done')

Done


## tf-idf

In [29]:
import os
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')

# Display the first few rows of the train and test data to understand their structure
train_head = train_data.head()
test_head = test_data.head()

train_head, test_head


(           ID         first_party                    second_party  \
 0  TRAIN_0000   Phil A. St. Amant              Herman A. Thompson   
 1  TRAIN_0001      Stephen Duncan                  Lawrence Owens   
 2  TRAIN_0002   Billy Joe Magwood  Tony Patterson, Warden, et al.   
 3  TRAIN_0003          Linkletter                          Walker   
 4  TRAIN_0004  William Earl Fikes                         Alabama   
 
                                                facts  first_party_winner  
 0  On June 27, 1962, Phil St. Amant, a candidate ...                   1  
 1  Ramon Nelson was riding his bike when he suffe...                   0  
 2  An Alabama state court convicted Billy Joe Mag...                   1  
 3  Victor Linkletter was convicted in state court...                   0  
 4  On April 24, 1953 in Selma, Alabama, an intrud...                   1  ,
           ID                                        first_party  \
 0  TEST_0000                                        

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows as a validation set (fixed random_state
# so the split is repeatable). Only the raw 'facts' text is used as input.
X_train, X_val, y_train, y_val = train_test_split(train_data['facts'], train_data['first_party_winner'], test_size=0.2, random_state=42)

# Pipeline: TF-IDF vectorizer followed by a Multinomial Naive Bayes classifier
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Fit the whole pipeline on the training part only
model.fit(X_train, y_train)

# Score the fitted pipeline on the held-out validation part
validation_score = model.score(X_val, y_val)

# Predict 'first_party_winner' for the test data
test_predictions = model.predict(test_data['facts'])

validation_score, test_predictions[:30]  # Display the validation score and the first 30 predictions


(0.6612903225806451,
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1]))

## 불용어처리, 베이스라인 코드

In [35]:
# Lower-case English function words that are filtered out of the case facts.
basic_stopwords = {
    # personal, possessive and reflexive pronouns
    "i", "me", "my", "myself",
    "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself",
    "she", "her", "hers", "herself",
    "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves",
    # interrogatives and demonstratives
    "what", "which", "who", "whom", "this", "that", "these", "those",
    # forms of "be", "have" and "do"
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having",
    "do", "does", "did", "doing",
    # articles and conjunctions
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
    # prepositions and directional words
    "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from",
    "up", "down", "in", "out", "on", "off", "over", "under",
    # adverbs, quantifiers and negations
    "again", "further", "then", "once", "here", "there",
    "when", "where", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some", "such",
    "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very",
    # contraction fragments and remaining short words
    "s", "t", "can", "just", "don", "should", "now",
    # "will" is not part of the set (it was commented out of the original list)
}

# Function to remove stopwords from a text
def remove_basic_stopwords(text):
    """Return *text* with every token found in ``basic_stopwords`` removed.

    The text is split with NLTK's ``word_tokenize``; a token is dropped when
    its lower-cased form is in ``basic_stopwords``. The surviving tokens keep
    their original casing and are joined with single spaces, so punctuation
    tokens end up space-separated (e.g. ``"June 27 , 1962"``).

    Args:
        text: The string to clean.

    Returns:
        The filtered tokens joined by single spaces.
    """
    # Imported here because no visible cell imports word_tokenize, which makes
    # the function raise NameError on a fresh kernel.
    # NOTE(review): word_tokenize needs NLTK's Punkt tokenizer data to be
    # downloaded beforehand - confirm it is available in the environment.
    from nltk.tokenize import word_tokenize

    kept_tokens = [
        token for token in word_tokenize(text)
        if token.lower() not in basic_stopwords
    ]
    return ' '.join(kept_tokens)

# Applying the function to the 'facts' column of the train data
train_data['facts_cleaned'] = train_data['facts'].apply(remove_basic_stopwords)

# Display the first few rows of the updated train data
train_data[['facts', 'facts_cleaned']].head()

Unnamed: 0,facts,facts_cleaned
0,"On June 27, 1962, Phil St. Amant, a candidate ...","June 27 , 1962 , Phil St. Amant , candidate pu..."
1,Ramon Nelson was riding his bike when he suffe...,Ramon Nelson riding bike suffered lethal blow ...
2,An Alabama state court convicted Billy Joe Mag...,Alabama state court convicted Billy Joe Magwoo...
3,Victor Linkletter was convicted in state court...,Victor Linkletter convicted state court eviden...
4,"On April 24, 1953 in Selma, Alabama, an intrud...","April 24 , 1953 Selma , Alabama , intruder bro..."


In [39]:
# Apply the same stop-word removal to the 'facts' column of the test data
test_data['facts_cleaned'] = test_data['facts'].apply(remove_basic_stopwords)

# Display the first few rows of the updated test data
test_data[['facts', 'facts_cleaned']].head()

Unnamed: 0,facts,facts_cleaned
0,The 1984 Bail Reform Act allowed the federal c...,1984 Bail Reform Act allowed federal courts de...
1,Lexecon Inc. was a defendant in a class action...,Lexecon Inc. defendant class action lawsuit . ...
2,"In 2002 and 2003, Fox Television Stations broa...","2002 2003 , Fox Television Stations broadcast ..."
3,During his trial for armed robbery of a federa...,trial armed robbery federally insured savings ...
4,"In 1993, a magistrate judge issued a warrant a...","1993 , magistrate judge issued warrant authori..."


In [40]:
clean_train_data = train_data.drop(columns=['facts'])
clean_test_data = test_data.drop(columns=['facts'])

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np

vectorizer = TfidfVectorizer()

def get_vector2(vectorizer, df, train_mode):
    """Vectorize the stop-word-free facts plus both party names into one array.

    Same layout as the baseline feature builder, except that the facts are
    read from the ``'facts_cleaned'`` column.

    Args:
        vectorizer: Object offering ``fit_transform`` and ``transform``; each
            result must provide ``toarray()``.
        df: Table indexable by ``'facts_cleaned'``, ``'first_party'`` and
            ``'second_party'``.
        train_mode: Truthy to fit the vectorizer on the cleaned facts while
            transforming them; falsy to reuse the already fitted vectorizer.

    Returns:
        Dense array laid out as ``[first_party | second_party | facts]``
        along axis 1.
    """
    # Fit (training) or merely apply (inference) the vectorizer on the facts.
    if train_mode:
        facts_block = vectorizer.fit_transform(df['facts_cleaned'])
    else:
        facts_block = vectorizer.transform(df['facts_cleaned'])

    # Party names are encoded with the vectorizer fitted on the facts.
    first_block, second_block = (
        vectorizer.transform(df[name]) for name in ('first_party', 'second_party')
    )

    # Sparse results are densified so that NumPy can join them column-wise.
    stacked = (first_block.toarray(), second_block.toarray(), facts_block.toarray())
    return np.concatenate(stacked, axis=1)

# Example usage:
# X_train = get_vector2(vectorizer, clean_train_data, train_mode=True)
# Y_train = clean_train_data['first_party_winner']
# model = LogisticRegression()
# model.fit(X_train, Y_train)


In [43]:
X_train = get_vector2(vectorizer, clean_train_data, True)
Y_train = clean_train_data["first_party_winner"]
X_test = get_vector2(vectorizer, clean_test_data, False)

In [44]:
model = LogisticRegression()
model.fit(X_train, Y_train)

In [45]:
submit = pd.read_csv('./sample_submission.csv')

In [46]:
pred = model.predict(X_test)

In [48]:
submit['first_party_winner'] = pred
submit.to_csv('./baseline_submit_cleaned.csv', index=False)
print('Done')

Done


## 3위 코드

In [1]:
import os
import random
import numpy as np
import pandas as pd
import re

from nltk.tokenize import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from imblearn.under_sampling import NeighbourhoodCleaningRule

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [2]:
# Fix Seed
def seed_everything(seed):
    """Seed Python's and NumPy's global random generators with *seed*.

    Also stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.

    NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so the
    assignment here presumably only influences processes started afterwards.

    Args:
        seed: Value handed to ``random.seed`` and ``np.random.seed``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for reseed in (random.seed, np.random.seed):
        reseed(seed)

seed_everything(42)

In [3]:
# Load Data
train = pd.read_csv('./train.csv').drop('ID', axis=1)
test = pd.read_csv('./test.csv').drop('ID', axis=1)
submission = pd.read_csv('./sample_submission.csv')

In [4]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# 문자열 전처리
cols = ['first_party', 'second_party', 'facts']
shortword = re.compile(r'\W*\b\w{1}\b')
tokenizer = TreebankWordTokenizer()
stopword = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

# 전처리 함수 1
def _clean_sample(sample, shortword, tokenizer, stopword_set, lemmatizer=None):
    """Clean one normalised string and return it as a space-joined token string.

    Args:
        sample: Text that has already been stripped and lower-cased.
        shortword: Compiled pattern whose matches are deleted first (the
            notebook passes a pattern matching one-character words).
        tokenizer: Object with a ``tokenize(str)`` method returning tokens.
        stopword_set: Tokens to discard.
        lemmatizer: Optional object with ``lemmatize(token, 'n')``; when
            given, every kept token is replaced by its noun lemma.

    Returns:
        The surviving tokens joined with single spaces.
    """
    sample = shortword.sub('', sample)
    # Keep only Hangul syllables, digits, ASCII letters and whitespace.
    sample = re.sub(r"[^\uAC00-\uD7A30-9a-zA-Z\s]", "", sample)
    tokens = [tok for tok in tokenizer.tokenize(sample) if tok not in stopword_set]
    if lemmatizer is not None:
        tokens = [lemmatizer.lemmatize(tok, 'n') for tok in tokens]
    return ' '.join(tokens)


def preprocessing(df, cols, shortword, tokenizer, stopword, lemmatizer):
    """Normalise the text columns of *df* and return them as token strings.

    Every column named in *cols* is rewritten **in place** on *df*:
    surrounding whitespace is stripped, runs of two or more spaces collapse
    to one, the text is lower-cased, and literal ``,`` and ``.`` characters
    are removed. The columns ``'first_party'``, ``'second_party'`` and
    ``'facts'`` are then cleaned sample by sample (pattern removal,
    special-character removal, tokenisation, stop-word filtering). Party
    names are additionally lemmatized as nouns; the facts are not.

    Args:
        df: pandas DataFrame holding string columns; it is modified in place.
        cols: Column names to process. Names other than the three listed
            above are still normalised in *df*, but only trigger a printed
            warning and contribute nothing to the result.
        shortword: Compiled regular expression whose matches are deleted
            from each sample.
        tokenizer: Object with a ``tokenize(str)`` method.
        stopword: Iterable of tokens to drop.
        lemmatizer: Object with a ``lemmatize(token, pos)`` method.

    Returns:
        A tuple ``(first_party_lst, second_party_lst, facts_lst)`` of lists
        of space-joined token strings, one entry per row.
    """
    cleaned = {'first_party': [], 'second_party': [], 'facts': []}
    # Only the party names are lemmatized; the facts keep their surface forms.
    lemmatized_cols = ('first_party', 'second_party')
    # A set gives constant-time membership tests for the per-token filter.
    stopword_set = set(stopword)

    for col in cols:
        df[col] = (
            df[col]
            .str.strip()
            # Collapse any run of 2+ spaces; a plain '  ' -> ' ' replacement
            # would leave runs of three or more spaces partly intact.
            .str.replace(r' {2,}', ' ', regex=True)
            .str.lower()
            # regex=False: '.' must be removed literally, independent of the
            # pandas version's default for the regex argument.
            .str.replace(',', '', regex=False)
            .str.replace('.', '', regex=False)
        )

        if col not in cleaned:
            print('컬럼이름을 변경하지 말아주세요!')
            continue

        col_lemmatizer = lemmatizer if col in lemmatized_cols else None
        cleaned[col].extend(
            _clean_sample(sample, shortword, tokenizer, stopword_set, col_lemmatizer)
            for sample in df[col]
        )

    return cleaned['first_party'], cleaned['second_party'], cleaned['facts']

# 전처리 함수 2(벡터화)
def preprocessing_2(first, second, facts, vec, vec_facts, train=True):
    """Vectorize the cleaned party names and facts into one dense matrix.

    Args:
        first: List of cleaned first-party strings.
        second: List of cleaned second-party strings.
        facts: List of cleaned fact strings.
        vec: Vectorizer shared by both party columns.
        vec_facts: Vectorizer used for the facts.
        train: When truthy, ``vec`` is fitted on ``first + second`` and
            ``vec_facts`` on ``facts`` before anything is transformed.

    Returns:
        The dense first-party, second-party and facts matrices joined along
        axis 1, in that order.
    """
    if train:
        # One shared vocabulary for both parties, a separate one for the facts.
        vec.fit(first + second)
        vec_facts.fit(facts)

    encoding_plan = ((vec, first), (vec, second), (vec_facts, facts))
    dense_blocks = [
        encoder.transform(texts).toarray()
        for encoder, texts in encoding_plan
    ]
    return np.concatenate(dense_blocks, axis=1)

In [6]:
import nltk
# 문자열 전처리 1
cols = ['first_party', 'second_party', 'facts']
shortword = re.compile(r'\W*\b\w{1}\b')
tokenizer = TreebankWordTokenizer()
stopword = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

first_train, second_train, facts_train = preprocessing(train, cols, shortword, tokenizer, stopword, lemmatizer)
first_test, second_test, facts_test = preprocessing(test, cols, shortword, tokenizer, stopword, lemmatizer)

# 문자열 전처리 2(벡터화)
vec = CountVectorizer(ngram_range=(1,2))
vec_facts = TfidfVectorizer(ngram_range=(1,2))

X_train = preprocessing_2(first_train, second_train, facts_train, vec, vec_facts)
y_train = train['first_party_winner']
X_test = preprocessing_2(first_test, second_test, facts_test, vec, vec_facts, train=False)

  df[col] = df[col].str.replace('.','')


In [7]:
print('<train 데이터>')
print(X_train.shape, y_train.shape)
print()
print('<test 데이터>')
print(X_test.shape)

<train 데이터>
(2478, 211292) (2478,)

<test 데이터>
(1240, 211292)


### 램부족

In [None]:
# 데이터 불균형 문제 전처리(언더샘플링)
X_nc, y_nc = NeighbourhoodCleaningRule(n_neighbors=3).fit_resample(X_train, y_train)
# 램이 부족하다
print('Train Data Shape after UnderSampling')
print(X_nc.shape, y_nc.shape)
print('='*20)
print('Train target after UnderSampling')
print(y_nc.value_counts())

In [None]:
# Train, Validation 분리
Train_X, Val_X, Train_y, Val_y = train_test_split(X_nc, y_nc, test_size=.25, random_state=42, stratify=y_nc)
print('Train Data Shape')
print(Train_X.shape, Train_y.shape)
print('-'*20)
print('Train target')
print(Train_y.value_counts())
print('='*20)
print('Validation Data Shape')
print(Val_X.shape, Val_y.shape)
print('-'*20)
print('Validation target')
print(Val_y.value_counts())

In [None]:
Logistic = LogisticRegression(max_iter=500, random_state=42)
Logistic.fit(Train_X, Train_y)
print(classification_report(Val_y, Logistic.predict(Val_X)))

In [None]:
submission['first_party_winner'] = Logistic.predict(X_test)
submission.to_csv('logi___2.csv', index=False)

### 램부족 해결

In [None]:
from imblearn.under_sampling import NeighbourhoodCleaningRule
import numpy as np
import gc

def batch_process_neighbourhood_cleaning_rule(X, y, batch_size, n_neighbors=3):
    """Apply NeighbourhoodCleaningRule to random batches of ``(X, y)``.

    The rows are shuffled with NumPy's global random generator, cut into
    batches of at most *batch_size* rows, and each batch is under-sampled on
    its own. Because every batch is cleaned independently, the result is not
    the same as running the rule once on the complete data set.

    Args:
        X: 2-D array of features supporting integer-array row indexing.
        y: Labels aligned with the rows of *X* (array-like or pandas Series).
        batch_size: Maximum number of rows per batch; must be at least 1.
        n_neighbors: Passed to ``NeighbourhoodCleaningRule``.

    Returns:
        ``(X_resampled, y_resampled)`` as NumPy arrays, the per-batch results
        stacked in batch order.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    # Positional indexing: y[batch_indices] on a pandas Series would look the
    # values up by index label and break for any non-default index.
    y_values = np.asarray(y)

    total_size = X.shape[0]
    if total_size == 0:
        # Nothing to resample; np.vstack would fail on an empty list below.
        return np.asarray(X), y_values

    # Shuffle so that each batch is a random sample of the rows.
    indices = np.arange(total_size)
    np.random.shuffle(indices)

    X_parts, y_parts = [], []

    for start in range(0, total_size, batch_size):
        batch_indices = indices[start:start + batch_size]
        X_batch, y_batch = X[batch_indices], y_values[batch_indices]

        # The sampler needs at least two classes; a single-class batch has
        # nothing to clean and is kept unchanged instead of raising.
        if np.unique(y_batch).size >= 2:
            X_batch, y_batch = NeighbourhoodCleaningRule(
                n_neighbors=n_neighbors
            ).fit_resample(X_batch, y_batch)

        X_parts.append(X_batch)
        y_parts.append(y_batch)
        gc.collect()

    # Merge the per-batch results into single arrays.
    return np.vstack(X_parts), np.concatenate(y_parts)

# Batch size (tune it to the available memory)
batch_size = 20  # rows per batch

# Run the batched under-sampling
X_nc, y_nc = batch_process_neighbourhood_cleaning_rule(X_train, y_train, batch_size)

# Report the result
print('Train Data Shape after UnderSampling')
print(X_nc.shape, y_nc.shape)
print('='*20)
print('Train target after UnderSampling')
print(pd.Series(y_nc).value_counts())


In [None]:
# Split the under-sampled data into train and validation parts (75% / 25%),
# stratified on the labels.
Train_X, Val_X, Train_y, Val_y = train_test_split(X_nc, y_nc, test_size=.25, random_state=42, stratify=y_nc)
print('Train Data Shape')
print(Train_X.shape, Train_y.shape)
print('-'*20)
print('Train target')
# y_nc is a NumPy array here (the batch helper returns np.concatenate(...)),
# and ndarrays have no value_counts(); wrap the labels in a Series first.
print(pd.Series(Train_y).value_counts())
print('='*20)
print('Validation Data Shape')
print(Val_X.shape, Val_y.shape)
print('-'*20)
print('Validation target')
print(pd.Series(Val_y).value_counts())

In [None]:
Logistic = LogisticRegression(max_iter=500, random_state=42)
Logistic.fit(Train_X, Train_y)
print(classification_report(Val_y, Logistic.predict(Val_X)))

In [None]:
submission['first_party_winner'] = Logistic.predict(X_test)
submission.to_csv('logi___2.csv', index=False)

### 램부족 해결 - 2

In [None]:
from imblearn.under_sampling import NeighbourhoodCleaningRule
import numpy as np
import gc


def optimized_batch_process_neighbourhood_cleaning_rule(X, y, batch_size, n_neighbors=3):
    """Under-sample ``(X, y)`` with a single NeighbourhoodCleaningRule pass.

    The rule is fitted once on the complete data set, so this function needs
    as much memory as calling ``fit_resample`` directly. The earlier version
    then sliced the resampled arrays into *batch_size* chunks and stacked
    them back together; that reproduced the same arrays while holding a
    second copy in memory, so the step was removed.

    Args:
        X: Feature matrix.
        y: Labels aligned with the rows of *X*.
        batch_size: Unused; kept so existing calls keep working.
        n_neighbors: Passed to ``NeighbourhoodCleaningRule``.

    Returns:
        ``(X_resampled, y_resampled)`` as NumPy arrays.
    """
    ncr = NeighbourhoodCleaningRule(n_neighbors=n_neighbors)
    X_resampled, y_resampled = ncr.fit_resample(X, y)

    # np.asarray keeps the ndarray return type even when the sampler hands
    # back pandas objects (e.g. a Series for y).
    return np.asarray(X_resampled), np.asarray(y_resampled)

# NOTE: this variant runs NeighbourhoodCleaningRule once on the whole dataset, so its peak
# memory use is no lower than calling fit_resample directly. It only works when the full
# dataset (and the rule's neighbour search over it) fits in memory.


# Batch size handed to the helper (tune it to the available memory)
batch_size = 20  # rows per batch

# Run the under-sampling
X_nc, y_nc = optimized_batch_process_neighbourhood_cleaning_rule(X_train, y_train, batch_size)

# Report the result
print('Train Data Shape after UnderSampling')
print(X_nc.shape, y_nc.shape)
print('='*20)
print('Train target after UnderSampling')
print(pd.Series(y_nc).value_counts())

# Modified code for optimized batch processing with memory management
