[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ogatash-lab/ARES2023ExpData/blob/main/MLR.ipynb)

# Importing Libraries

In [41]:
import sklearn
print(sklearn.__version__)
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from numpy.lib.function_base import vectorize
from sklearn.feature_extraction.text import CountVectorizer
from numpy.core.fromnumeric import size
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import tqdm

1.2.2


# Data-Set

In [42]:
df_2007 = pd.read_csv("https://raw.githubusercontent.com/sho6210/test/main/data/data_set_2007.csv", header=0)
df_2008 = pd.read_csv("https://raw.githubusercontent.com/sho6210/test/main/data/data_set_2008.csv", header=0)
df_2009 = pd.read_csv("https://raw.githubusercontent.com/sho6210/test/main/data/data_set_2009.csv", header=0)
df_2010 = pd.read_csv("https://raw.githubusercontent.com/sho6210/test/main/data/data_set_2010.csv", header=0)
df_2011 = pd.read_csv("https://raw.githubusercontent.com/sho6210/test/main/data/data_set_2011.csv", header=0)
df_2012 = pd.read_csv("https://raw.githubusercontent.com/sho6210/test/main/data/data_set_2012.csv", header=0)
df_2013 = pd.read_csv("https://raw.githubusercontent.com/sho6210/test/main/data/data_set_2013.csv", header=0)
df_2014 = pd.read_csv("https://raw.githubusercontent.com/sho6210/test/main/data/data_set_2014.csv", header=0)
df_2015 = pd.read_csv("https://raw.githubusercontent.com/sho6210/test/main/data/data_set_2015.csv", header=0)
df_2016 = pd.read_csv("https://raw.githubusercontent.com/sho6210/test/main/data/data_set_2016.csv", header=0)
df_2017 = pd.read_csv("https://raw.githubusercontent.com/sho6210/test/main/data/data_set_2017.csv", header=0)
df_2018 = pd.read_csv("https://raw.githubusercontent.com/sho6210/test/main/data/data_set_2018.csv", header=0)
df_2019 = pd.read_csv("https://raw.githubusercontent.com/sho6210/test/main/data/data_set_2019.csv", header=0)

# df_0000 storage list
list_df = [df_2007, df_2008, df_2009, df_2010, df_2011, df_2012, df_2013, df_2014, df_2015, df_2016, df_2017, df_2018, df_2019]

# Reading and data splitting

In [43]:
def DataMake(split, metric, list_df):
  # variable
  label_train = []
  label_test = []
  sentences_train = []
  sentences_test = []

  # Split the correct answer labels and target sentences into study and test.
  list_train = list_df[:split]
  list_test = list_df[split:]
  print('metric:', metric)
  print('train:', len(list_train), 'test:', len(list_test))

  # Store data frames for testing in df_test.
  df_test = pd.concat(list_test)

  # Extract and list metric value in a list.
  for i in list_train:
    label_train.append(i[metric].values)
    sentences_train.append(i['Description'].values)
  for i in list_test:
    label_test.append(i[metric].values)
    sentences_test.append(i['Description'].values)

  # data for input
  # metric value
  y_train = np.concatenate(label_train, 0)
  y_test = np.concatenate(label_test, 0)
  # description
  train_sentence = np.concatenate(sentences_train, 0)
  test_sentence = np.concatenate(sentences_test, 0)

  return y_train, y_test, train_sentence, test_sentence, df_test

# Natural Language Processing (train)

In [44]:
def NLP_train(train_sentence):
  # Natural Language Processing
  # Creating BoW features with sklearn's CountVectorizer
  vectorizer = CountVectorizer(stop_words="english")
  X_train = vectorizer.fit_transform(train_sentence)
  
  return X_train, vectorizer

# Multinomial Logistic Regression (train)

In [45]:
def MLR_train(X_train, y_train):
  # Create a classification model for MLR using vectorized features
  lr = LogisticRegression(C=0.3, random_state=0, n_jobs=-1)
  lr.fit(X_train, y_train)

  return lr



# Natural Language Processing (test)

In [46]:
def NLP_test(test_sentence, vectorizer):
  # Natural Language Processing
  # Creating BoW features with sklearn's CountVectorizer.
  X_test = vectorizer.transform(test_sentence)
  
  return X_test

# Multinomial Logistic Regression (test)

In [47]:
def MLR_test(metric, X_test, y_test, lr):
  # Test data to confirm accuracy.
  y_pred = lr.predict(X_test)
  cm = confusion_matrix(y_test, y_pred)

  # Branching when creating a table.
  if metric == 'AV':
    table = pd.DataFrame(cm, columns=['Predicted P', 'Predicted L', 'Predicted A', 'Predicted N'], index=['Actual P', 'Actual L', 'Actual A', 'Actual N'])
  elif metric == 'AC':
    table = pd.DataFrame(cm, columns=['Predicted L', 'Predicted H'], index=['Actual L', 'Actual H'])
  elif metric == 'PR':
    table = pd.DataFrame(cm, columns=['Predicted N', 'Predicted L', 'Predicted H'], index=['Actual N', 'Actual L', 'Actual H'])
  elif metric == 'UI':
    table = pd.DataFrame(cm, columns=['Predicted N', 'Predicted R'], index=['Actual N', 'Actual R'])
  elif metric == 'S':
    table = pd.DataFrame(cm, columns=['Predicted U', 'Predicted C'], index=['Actual U', 'Actual C'])
  else:
    table = pd.DataFrame(cm, columns=['Predicted N', 'Predicted L', 'Predicted H'], index=['Actual N', 'Actual L', 'Actual H'])

  # Accuracy
  print("accuracy:", accuracy_score(y_test, y_pred))
  print(table)
  print('-'*70)

  # Returns a list containing the prediction results and a data frame for testing.
  return y_pred, df_test



# Example

In [None]:
metrics = ['AV', 'AC', 'PR', 'UI', 'S', 'C', 'I', 'A']
split = 12

for metric in metrics:
  y_train, y_test, train_sentence, test_sentence, df_test = DataMake(split, metric, list_df)

  X_train, vectorizer = NLP_train(train_sentence)
  lr = MLR_train(X_train, y_train)

  X_test = NLP_test(test_sentence, vectorizer)
  MLR_test(metric, X_test, y_test, lr)

metric: AV
train: 11 test: 2
accuracy: 0.8702647514434587
          Predicted P  Predicted L  Predicted A  Predicted N
Actual P          103           54          351            1
Actual L           17         3919         1752           11
Actual A           86         1179        20603            8
Actual N            4           71          151           94
----------------------------------------------------------------------
metric: AC
train: 11 test: 2
accuracy: 0.9405365441487115
          Predicted L  Predicted H
Actual L          934         1026
Actual H          663        25781
----------------------------------------------------------------------
metric: PR
train: 11 test: 2
accuracy: 0.7965427404590902
          Predicted N  Predicted L  Predicted H
Actual N          484          524          776
Actual L          159         4214         3106
Actual H           83         1131        17927
----------------------------------------------------------------------
metric: UI
