# 1. **Importing libraries that are planned to be used in our process**

In [27]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score
## for data
import pandas as pd

import re
import nltk## for language detection



import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# 2. Reading the dataset with pandas, dropping null values (calculated earlier at less than 5%) and creating a copy of the dataframe for the modelling process

In [28]:
df = pd.read_csv('justice.csv', delimiter=',', encoding = "utf8")
df.dropna(inplace=True)
df.head()

Unnamed: 0.1,Unnamed: 0,ID,name,href,docket,term,first_party,second_party,facts,facts_len,majority_vote,minority_vote,first_party_winner,decision_type,disposition,issue_area
1,1,50613,Stanley v. Illinois,https://api.oyez.org/cases/1971/70-5014,70-5014,1971,"Peter Stanley, Sr.",Illinois,<p>Joan Stanley had three children with Peter ...,757,5,2,True,majority opinion,reversed/remanded,Civil Rights
2,2,50623,Giglio v. United States,https://api.oyez.org/cases/1971/70-29,70-29,1971,John Giglio,United States,<p>John Giglio was convicted of passing forged...,495,7,0,True,majority opinion,reversed/remanded,Due Process
3,3,50632,Reed v. Reed,https://api.oyez.org/cases/1971/70-4,70-4,1971,Sally Reed,Cecil Reed,"<p>The Idaho Probate Code specified that ""male...",378,7,0,True,majority opinion,reversed/remanded,Civil Rights
4,4,50643,Miller v. California,https://api.oyez.org/cases/1971/70-73,70-73,1971,Marvin Miller,California,"<p>Miller, after conducting a mass mailing cam...",305,5,4,True,majority opinion,vacated/remanded,First Amendment
5,5,50644,Kleindienst v. Mandel,https://api.oyez.org/cases/1971/71-16,71-16,1971,"Richard G. Kleindienst, Attorney General of th...","Ernest E. Mandel, et al.",<p>Ernest E. Mandel was a Belgian professional...,2282,6,3,True,majority opinion,reversed,First Amendment


In [29]:
df1 = df.copy()

In [30]:
df1.drop(columns=['Unnamed: 0', 'docket','name','first_party','second_party', 'issue_area', 
                 'facts_len', 'majority_vote', 'minority_vote', 'href', 'ID','term'], inplace=True)

# 3. Seperating the dataset into target variables and two groups of independent variables, one (df_cat) which requires one-hot encoding to be machine readable and the other (df_nlp) which is text data which needs to be cleaned for features to be engineered from it.

In [31]:
df_cat = df1[['decision_type', 'disposition']]

In [32]:
df_target = df1['first_party_winner']

In [33]:
df_nlp = df1['facts']

# 4. Resetting indices to avoid NaNs during concatenation and performing one-hot encoding

In [34]:
df_cat.reset_index(drop=True, inplace=True)
df_target.reset_index(drop=True, inplace=True)
df_nlp.reset_index(drop=True, inplace=True)

In [35]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
df_target= label_encoder.fit_transform(df_target)

In [36]:
df_target1 = pd.DataFrame(df_target, columns=['first_party_winner'])
df_target1

Unnamed: 0,first_party_winner
0,1
1,1
2,1
3,1
4,1
...,...
3093,1
3094,1
3095,0
3096,1


In [37]:
frames = [df_cat, df_target1]
df_concat = pd.concat(frames, axis=1, join='inner')
df_concat

Unnamed: 0,decision_type,disposition,first_party_winner
0,majority opinion,reversed/remanded,1
1,majority opinion,reversed/remanded,1
2,majority opinion,reversed/remanded,1
3,majority opinion,vacated/remanded,1
4,majority opinion,reversed,1
...,...,...,...
3093,majority opinion,reversed/remanded,1
3094,majority opinion,reversed/remanded,1
3095,majority opinion,affirmed,0
3096,majority opinion,vacated/remanded,1


In [38]:
df_nlp1 = pd.DataFrame(df_nlp, columns=['facts'])

In [39]:
df_nlp1['facts'] = df_nlp1['facts'].str.replace(r'<[^<>]*>', '', regex=True)
df_nlp1

Unnamed: 0,facts
0,Joan Stanley had three children with Peter Sta...
1,John Giglio was convicted of passing forged mo...
2,"The Idaho Probate Code specified that ""males m..."
3,"Miller, after conducting a mass mailing campai..."
4,Ernest E. Mandel was a Belgian professional jo...
...,...
3093,For over a century after the Alaska Purchase i...
3094,"Refugio Palomar-Santiago, a Mexican national, ..."
3095,Tarahrick Terry pleaded guilty to one count of...
3096,Joshua James Cooley was parked in his pickup t...


# 5. Performing initial cleaning and tokenizing the corpus, introducing a function to perform further cleaning and Lemmatization upon the data.

In [40]:
import nltk
nltk.download('punkt')

# corpus = df_nlp1["facts"]
# lst_tokens = nltk.tokenize.word_tokenize(corpus.str.cat(sep=" "))

corpus = df_nlp1["facts"]
lst_tokens = nltk.tokenize.word_tokenize(corpus.str.cat(sep=" "))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91953\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [41]:
lst_tokens

['Joan',
 'Stanley',
 'had',
 'three',
 'children',
 'with',
 'Peter',
 'Stanley',
 '.',
 'The',
 'Stanleys',
 'never',
 'married',
 ',',
 'but',
 'lived',
 'together',
 'off',
 'and',
 'on',
 'for',
 '18',
 'years',
 '.',
 'When',
 'Joan',
 'died',
 ',',
 'the',
 'State',
 'of',
 'Illinois',
 'took',
 'the',
 'children',
 '.',
 'Under',
 'Illinois',
 'law',
 ',',
 'unwed',
 'fathers',
 'were',
 'presumed',
 'unfit',
 'parents',
 'regardless',
 'of',
 'their',
 'actual',
 'fitness',
 'and',
 'their',
 'children',
 'became',
 'wards',
 'of',
 'the',
 'state',
 '.',
 'Peter',
 'appealed',
 'the',
 'decision',
 ',',
 'arguing',
 'that',
 'the',
 'Illinois',
 'law',
 'violated',
 'the',
 'Equal',
 'Protection',
 'Clause',
 'of',
 'the',
 'Fourteenth',
 'Amendment',
 'because',
 'unwed',
 'mothers',
 'were',
 'not',
 'deprived',
 'of',
 'their',
 'children',
 'without',
 'a',
 'showing',
 'that',
 'they',
 'were',
 'actually',
 'unfit',
 'parents',
 '.',
 'The',
 'Illinois',
 'Supreme',
 'C

In [42]:
ps = nltk.stem.porter.PorterStemmer()
lem = nltk.stem.wordnet.WordNetLemmatizer()

In [43]:

def utils_preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    ## clean (convert to lowercase and remove punctuations and characters and then strip)
    text = re.sub(r'[^\w\s]', '', str(text).lower().strip())
            
    ## Tokenize (convert from string to list)
    lst_text = text.split()    ## remove Stopwords
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
                
    ## Stemming (remove -ing, -ly, ...)
    if flg_stemm == True:
        ps = nltk.stem.porter.PorterStemmer()
        lst_text = [ps.stem(word) for word in lst_text]
                
    ## Lemmatisation (convert the word into root word)
    if flg_lemm == True:
        lem = nltk.stem.wordnet.WordNetLemmatizer()
        lst_text = [lem.lemmatize(word) for word in lst_text]
            
    ## back to string from list
    text = " ".join(lst_text)
    return text

In [45]:
import nltk
nltk.download('wordnet')

df_nlp1["facts_clean"] = df_nlp1["facts"].apply(lambda x: utils_preprocess_text(x, flg_stemm=False, flg_lemm=True))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\91953\AppData\Roaming\nltk_data...


In [46]:
df_nlp1

Unnamed: 0,facts,facts_clean
0,Joan Stanley had three children with Peter Sta...,joan stanley had three child with peter stanle...
1,John Giglio was convicted of passing forged mo...,john giglio wa convicted of passing forged mon...
2,"The Idaho Probate Code specified that ""males m...",the idaho probate code specified that male mus...
3,"Miller, after conducting a mass mailing campai...",miller after conducting a mass mailing campaig...
4,Ernest E. Mandel was a Belgian professional jo...,ernest e mandel wa a belgian professional jour...
...,...,...
3093,For over a century after the Alaska Purchase i...,for over a century after the alaska purchase i...
3094,"Refugio Palomar-Santiago, a Mexican national, ...",refugio palomarsantiago a mexican national wa ...
3095,Tarahrick Terry pleaded guilty to one count of...,tarahrick terry pleaded guilty to one count of...
3096,Joshua James Cooley was parked in his pickup t...,joshua james cooley wa parked in his pickup tr...


# 6. Introducing Count Vectorizer to derive features from textual data.

In [47]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

In [48]:
vectorizer = CountVectorizer()
x = vectorizer.fit_transform(df_nlp1['facts_clean'])

In [49]:
x1 = x.toarray()

In [50]:
x

<3098x20357 sparse matrix of type '<class 'numpy.int64'>'
	with 290178 stored elements in Compressed Sparse Row format>

In [51]:
df.reset_index(drop=True, inplace=True)

In [52]:
df_nlp2 = pd.concat([df_nlp1,df_target1['first_party_winner']],axis=1, join='inner')

In [53]:
df_nlp2

Unnamed: 0,facts,facts_clean,first_party_winner
0,Joan Stanley had three children with Peter Sta...,joan stanley had three child with peter stanle...,1
1,John Giglio was convicted of passing forged mo...,john giglio wa convicted of passing forged mon...,1
2,"The Idaho Probate Code specified that ""males m...",the idaho probate code specified that male mus...,1
3,"Miller, after conducting a mass mailing campai...",miller after conducting a mass mailing campaig...,1
4,Ernest E. Mandel was a Belgian professional jo...,ernest e mandel wa a belgian professional jour...,1
...,...,...,...
3093,For over a century after the Alaska Purchase i...,for over a century after the alaska purchase i...,1
3094,"Refugio Palomar-Santiago, a Mexican national, ...",refugio palomarsantiago a mexican national wa ...,1
3095,Tarahrick Terry pleaded guilty to one count of...,tarahrick terry pleaded guilty to one count of...,0
3096,Joshua James Cooley was parked in his pickup t...,joshua james cooley wa parked in his pickup tr...,1


In [54]:
xfeatures = df_nlp2['facts_clean']
ylabel = df_nlp2['first_party_winner']

# 7. Using sklearn train_test_split and Pipeline to fit and score the model on Logistic Regression, RandomForest and K-NearestNeighbors on the newly engineered features

In [55]:
pipe = Pipeline(steps=[('cv',CountVectorizer()),('lr',LogisticRegression(solver='liblinear'))])

In [56]:
pipe.fit(X_train,y_train)

NameError: name 'X_train' is not defined

In [None]:
pipe.score(X_test,y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
pipe1= Pipeline(steps=[('cv',CountVectorizer()),('rf',RandomForestClassifier())])

In [None]:
pipe1.fit(X_train,y_train)

In [None]:
pipe1.score(X_test,y_test)

In [None]:
pipe2= Pipeline(steps=[('cv',CountVectorizer()),('rf',KNeighborsClassifier(n_neighbors=3))])

In [None]:
pipe2.fit(X_train,y_train)

In [None]:
pipe2.score(X_test,y_test)

# 8. Now including the one-hot encoded features within our model.

In [None]:
df_nl1=pd.concat([df_nlp2,df_cat],axis=1,join='inner')

In [None]:
df_nl1

In [None]:
xfeatures11 = df_nl1[['facts_clean','decision_type','disposition']]
ylabel11 = df_nl1['first_party_winner']

In [None]:
df_cat1=pd.get_dummies(df_cat['decision_type'])

In [None]:
df_cat2=pd.get_dummies(df_cat['disposition'])

In [None]:
df_cat3=pd.concat([df_cat2,df_cat1],axis=1,join='inner')

In [None]:
df_cat3=pd.concat([df_cat3,df_nl1['first_party_winner']],axis=1,join='inner')

In [None]:
vectorize=CountVectorizer()

In [None]:
count_matrix = vectorize.fit_transform(df_nl1['facts_clean'])
count_array = count_matrix.toarray()
data_hello = pd.DataFrame(data=count_array,columns = vectorize.get_feature_names())

In [None]:
data_hello=pd.concat([data_hello,df_cat3],axis=1,join='inner')

In [None]:
data_hello

In [None]:
X=data_hello.drop(columns=['first_party_winner'])
Y=data_hello['first_party_winner']

# 9. Using Principal Component Analysis to perform dimensionality reduction and measure the accuracy trade-off

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.3,random_state=10)

In [None]:
#XX=final_data.drop(columns=['first_party_winner'])
#YY=final_data['first_party_winner']

In [None]:
#from sklearn.preprocessing import MinMaxScaler
#scaler = MinMaxScaler()
#X_train = scaler.fit_transform(X_train)

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=400)
pca_fit = pca.fit_transform(X_train)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

PC_values = np.arange(pca.n_components_) + 1
plt.plot(PC_values, pca.explained_variance_ratio_, 'o-', linewidth=2, color='blue')
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Variance Explained')
plt.show()

In [None]:
print(pca.explained_variance_ratio_.sum())

In [None]:
pca_df = pd.DataFrame(data = pca_fit)
pca_df.head(3)

In [None]:
final_data=pd.concat([pca_df,Y],axis=1,join='inner')

In [None]:
final_data

In [None]:
XX=final_data.drop(columns=['first_party_winner'])
YY=final_data[['first_party_winner']]

In [None]:
rand=RandomForestClassifier()

In [None]:
rand.fit(XX,YY)

In [None]:
rand.score(XX,YY)

In [None]:
pca = PCA(n_components=400)
pca_fit = pca.fit_transform(X_test)
X_test = pd.DataFrame(data = pca_fit)

In [None]:
rand.score(X_test,y_test)

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {  'bootstrap': [True], 'max_depth': [5, 10, None], 'max_features': ['auto', 'log2'], 'n_estimators': [5, 6, 7, 8, 9, 10, 11, 12, 13, 15]}

In [None]:
rand = RandomForestClassifier(random_state = 1)
g_search = GridSearchCV(estimator = rand, param_grid = param_grid,
                        cv = 3, n_jobs = 1, verbose = 0, return_train_score=True)

In [None]:
g_search.fit(XX, YY);
print(g_search.best_params_)

In [None]:
print(g_search.score(X_test, y_test))

In [None]:
rand=RandomForestClassifier(bootstrap= True, max_depth= 5, max_features= 'log2', n_estimators= 15)

In [None]:
rand.fit(XX,YY)

In [None]:
y_pred = rand.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

In [None]:
from sklearn.metrics import f1_score

In [None]:
f1_score(y_test, y_pred)

In [None]:
y_pred1 = rand.predict(XX)
f1_score(YY, y_pred1)

In [None]:
model = XGBClassifier()
model.fit(XX, YY)
y_pred1 = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))
f1_score(y_test, y_pred1)

In [None]:
knn=KNeighborsClassifier(n_neighbors=3)

In [None]:
knn.fit(XX,YY)

In [None]:
knn.score(X_test,y_test)

# 10. Deploying a LSTM Model to achieve higher accuracy

In [None]:
import numpy as np
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional

In [None]:
n_unique_words = 10000 # cut texts after this number of words
maxlen = 2000
batch_size = 32

In [None]:
model = Sequential()
model.add(Embedding(n_unique_words, 128, input_length=maxlen))
model.add(LSTM(64))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
history=model.fit(XX, YY,
           batch_size=batch_size,
           epochs=200,
           validation_data=[X_test, y_test])

# **Thank you for taking the time to go through our submission. Any feedback is always welcome.**