# Task for Today  

***

## Company Market Cap Prediction  
  
Given *data about big companies*, let's try to predict the **market capitalization** of a given company.  
  
We will use a variety of regression models to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import warnings
# Silence every warning for the rest of the session. No category filter is
# given, so this also hides deprecation and convergence warnings.
warnings.filterwarnings(action='ignore')

In [None]:
# Load the raw company table from CSV into a DataFrame.
data = pd.read_csv('../input/fortune-500-data-2021/Fortune_1000.csv')

In [None]:
# Show the loaded DataFrame as the cell output.
data

In [None]:
# Print the DataFrame summary (columns, non-null counts, dtypes) to spot
# missing values and columns that were read as text.
data.info()

# Preprocessing

In [None]:
def preprocess_inputs(df):
    """Clean the raw company table and return scaled train/test splits.

    The input frame is not modified; all work is done on a copy.

    Processing steps, in order:
      1. Drop identifier / unused columns.
      2. Convert 'Market Cap' to float, treating '-' as missing, and drop
         rows whose target is missing.
      3. Fill missing 'profit' values with the column mean.
      4. Encode the yes/no columns as 1/0.
      5. One-hot encode 'sector', 'city' and 'state'.
      6. Split 70/30 into train and test sets (shuffled, random_state=1).
      7. Standardize the features with a scaler fitted on the train set only.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table containing every column referenced in this function.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardized feature matrices that keep the original row index and
        column names.
    y_train, y_test : pandas.Series
        'Market Cap' target values for the matching rows.
    """
    df = df.copy()

    # Drop unused columns
    df = df.drop(
        ['rank', 'rank_change', 'company', 'newcomer', 'prev_rank', 'CEO', 'Website', 'Ticker'],
        axis=1,
    )

    # '-' marks a missing market cap. Use np.nan and the builtin float:
    # the np.NaN and np.float aliases no longer exist in current NumPy.
    df['Market Cap'] = df['Market Cap'].replace('-', np.nan).astype(float)

    # Drop rows without a target value
    df = df.dropna(subset=['Market Cap']).reset_index(drop=True)

    # Fill remaining missing values
    # NOTE(review): the mean is computed before the train/test split, so test
    # rows contribute to the imputed value. Kept as-is to preserve results.
    df['profit'] = df['profit'].fillna(df['profit'].mean())

    # Binary encoding. map() yields numeric output directly instead of
    # relying on replace() downcasting an object column.
    yes_no = {'no': 0, 'yes': 1}
    for column in ['ceo_founder', 'ceo_woman', 'profitable']:
        df[column] = df[column].map(yes_no)

    # One-hot encoding. Dummy columns are appended after the remaining
    # columns, in the listed order, each prefixed with "<column>_".
    df = pd.get_dummies(df, columns=['sector', 'city', 'state'])

    # Split df into X and y
    y = df['Market Cap']
    X = df.drop('Market Cap', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.7, shuffle=True, random_state=1
    )

    # Scale X using statistics from the training rows only
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [None]:
# Build the scaled train/test feature matrices and targets from the raw data.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

In [None]:
# Show the training feature matrix returned by preprocess_inputs.
X_train

In [None]:
# Show the training target values returned by preprocess_inputs.
y_train

# Training

In [None]:
# Regressors to compare, keyed by display label. The labels are left-padded
# with spaces to a common width so the printed reports line up.
models = {}
models["     Linear Regression"] = LinearRegression()
models["Linear Regression (L2)"] = Ridge()
models["Linear Regression (L1)"] = Lasso()
models["         Decision Tree"] = DecisionTreeRegressor()
models["        Neural Network"] = MLPRegressor()
models["         Random Forest"] = RandomForestRegressor()
models["     Gradient Boosting"] = GradientBoostingRegressor()

# Fit every model on the training split, reporting progress as each finishes.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(f"{name} trained.")

# Results

In [None]:
# Report the root-mean-squared error of each fitted model on the test split.
for name, model in models.items():
    y_pred = model.predict(X_test)
    squared_errors = (y_test - y_pred) ** 2
    rmse = np.sqrt(np.mean(squared_errors))
    print(f"{name} RMSE: {rmse:.2f}")

In [None]:
# Report each fitted model's score() on the test split, labelled as R^2.
for name, model in models.items():
    r2 = model.score(X_test, y_test)
    print(f"{name} R^2 Score: {r2:.5f}")

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/fiwSBIS6N9c