# Task for Today  

***

## Investment Bank Program Type Prediction  

Given *data about investment bank program transactions*, let's try to predict the **type** of a given program.  
  
We will use a logistic regression model to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

In [None]:
# Load the investing-program CSV from the Kaggle input directory.
data = pd.read_csv(
    '../input/data-for-investing-type-prediction/investing_program_prediction_data.csv'
)

In [None]:
# Show the loaded frame as the cell's rich output for a first look at the data.
data

In [None]:
# Print the frame summary (column dtypes and non-null counts) to check what
# needs cleaning before modelling.
data.info()

# Preprocessing

In [None]:
def preprocess_inputs(df):
    """Clean, feature-engineer, split and scale the raw program data.

    The input frame is copied, so the caller's DataFrame is left untouched.
    It must contain the columns ``PE1``..``PE15``, ``SE1``, ``SE2``,
    ``BA1``..``BA7`` and ``InvType``.

    Steps:
      1. Each ``PE*`` value is reduced to the character at position 1 and
         cast to an integer.
      2. ``SE2`` is one-hot encoded; the dummy columns are appended and the
         original column is dropped.
      3. ``Child`` (``SE1 < 18``) and ``Senior`` (``SE1 >= 65``) flags are
         derived from ``SE1``.
      4. For every ``BA*`` column a ``Low_BA*`` flag marks values at or below
         that column's 25th percentile.
      5. ``InvType`` is separated as the target, the data is split 70/30
         (shuffled, ``random_state=1``) and the features are standardised
         with a scaler fitted on the training portion only.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data with the columns listed above.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Scaled feature matrices. They are rebuilt from the scaler output and
        therefore carry a fresh 0..n-1 index.
    y_train, y_test : pandas.Series
        Target values exactly as returned by ``train_test_split`` (they are
        not re-indexed to match the feature frames).
    """
    df = df.copy()

    # Keep only the character at position 1 of each PE value and make it an
    # integer. The builtin ``int`` is used because the ``np.int`` alias was
    # removed from NumPy and raises AttributeError on current releases.
    for column in ['PE' + str(i) for i in range(1, 16)]:
        df[column] = df[column].str[1].astype(int)

    # One-hot encode the SE2 column
    geo_dummies = pd.get_dummies(df['SE2'])
    df = pd.concat([df, geo_dummies], axis=1)
    df = df.drop('SE2', axis=1)

    # Engineer age features (vectorised comparisons; missing ages compare
    # False and therefore yield 0, as a row-wise comparison would)
    df['Child'] = (df['SE1'] < 18).astype(int)
    df['Senior'] = (df['SE1'] >= 65).astype(int)

    # Engineer activity features. The quartile is computed once per column
    # instead of once per row.
    # NOTE(review): the threshold is taken over the whole frame, i.e. before
    # the train/test split below, so test rows influence it. Left as is to
    # keep the results unchanged.
    for column in ['BA' + str(i) for i in range(1, 8)]:
        low_threshold = df[column].quantile(0.25)
        df['Low_' + column] = (df[column] <= low_threshold).astype(int)

    # Split df into X and y
    y = df['InvType']
    X = df.drop('InvType', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

    # Scale X with a standard scaler fitted on the training rows only
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

    return X_train, X_test, y_train, y_test

In [None]:
# Run the full preprocessing pipeline on the loaded data; this yields the
# scaled train/test feature frames and the matching target series.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Inspect the scaled training features, including the engineered columns.
X_train

In [None]:
# Inspect the training targets (the InvType column).
y_train

# Training

Test accuracy from earlier runs of this notebook, by amount of feature engineering:  
  
With no feature engineering:  
67.98%  
  
With age features:  
69.67%  
  
With age and activity features:  
71.15%

In [None]:
# Fit a logistic regression with default settings on the scaled training set
# (fit returns the estimator itself, so it can be bound directly).
model = LogisticRegression().fit(X_train, y_train)

# score() gives the mean accuracy on the held-out split; report it in percent.
acc = model.score(X_test, y_test)
print("Test Accuracy: {:.2f}%".format(100 * acc))

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/vBhGvRAqBQ0