# Task for Today  

***

## India Power Generation Region Prediction  

Given *data about daily power generation in India*, let's try to predict which **region** a given report is from.  
  
We will use eight different models to make our predictions.

# Getting Started

In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier

In [None]:
# Read the power generation CSV into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv('../input/daily-power-generation-in-india-20172020/file_02.csv')

In [None]:
# Show the freshly loaded DataFrame as the cell output.
data

In [None]:
# Print a summary of the DataFrame: column names, non-null counts and dtypes.
data.info()

# Dropping Index Column/Checking Missing Values

In [None]:
# Remove the 'index' column from the DataFrame.
data = data.drop(columns='index')

In [None]:
# Fraction of missing entries in each column.
data.isna().mean(axis=0)

In [None]:
# Fill the gaps in both nuclear generation columns with each column's own mean.
nuclear_columns = ['Nuclear Generation Actual (in MU)', 'Nuclear Generation Estimated (in MU)']
data[nuclear_columns] = data[nuclear_columns].fillna(data[nuclear_columns].mean())

In [None]:
# Count the missing cells across the whole DataFrame and report the total.
total_missing = data.isna().sum().sum()
print(f"Total missing values: {total_missing}")

# Creating Year and Month Columns

In [None]:
# Show the DataFrame again before the Date column is split up.
data

In [None]:
# Derive integer Year and Month features from the 'Date' strings by slicing
# characters 0-3 (year) and 5-6 (month).
# The builtin `int` replaces the `np.int` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24 (it now raises AttributeError).
data['Year'] = data['Date'].str[0:4].astype(int)
data['Month'] = data['Date'].str[5:7].astype(int)

# The raw date string is no longer needed once the features are extracted.
data = data.drop('Date', axis=1)

# Removing Commas From Thermal Columns

In [None]:
# The thermal columns are strings containing comma separators; strip the
# commas and convert the values to floats.
# The builtin `float` replaces the `np.float` alias, which was deprecated in
# NumPy 1.20 and removed in 1.24 (it now raises AttributeError).
for column in ['Thermal Generation Actual (in MU)', 'Thermal Generation Estimated (in MU)']:
    # regex=False: the comma is a literal character, not a pattern.
    data[column] = data[column].str.replace(',', '', regex=False).astype(float)

In [None]:
# Show the DataFrame after the thermal columns have been converted.
data

# Encoding Labels

In [None]:
# Fit an encoder on the Region labels, then overwrite the column with the
# resulting integer codes. The fitted encoder stays available as
# `label_encoder`.
label_encoder = LabelEncoder().fit(data['Region'])
data['Region'] = label_encoder.transform(data['Region'])

In [None]:
# Show the DataFrame with the encoded Region column.
data

In [None]:
# Show the dtype of every column.
data.dtypes

# Splitting/Scaling

In [None]:
# Work on a copy so `data` is left unchanged, then pull the target column out
# of the feature frame: X keeps every other column, y holds 'Region'.
X = data.copy()
y = X.pop('Region')

In [None]:
scaler = StandardScaler()

X = scaler.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

# Modeling/Training

In [None]:
# Each display name sits next to the estimator it describes, so the two lists
# derived below cannot drift out of order. Names are left-padded to a common
# width so the printed results line up.
named_models = [
    ("         Logistic Regression", LogisticRegression()),
    ("      Support Vector Machine", SVC()),
    ("              Neural Network", MLPClassifier()),
    ("               Decision Tree", DecisionTreeClassifier()),
    ("         AdaBoost Classifier", AdaBoostClassifier()),
    ("          Bagging Classifier", BaggingClassifier()),
    ("Gradient Boosting Classifier", GradientBoostingClassifier()),
    ("    Random Forest Classifier", RandomForestClassifier()),
]

models = [estimator for _, estimator in named_models]

model_names = [display_name for display_name, _ in named_models]

In [None]:
# Train every model on the training split and record its score on the test
# split, in the same order as `models`.
results = []

for model in models:
    model.fit(X_train, y_train)
    test_score = model.score(X_test, y_test)
    results.append(test_score)

# Results

In [None]:
# Print one line per model: its padded display name followed by the test
# score to five decimal places.
for display_name, test_score in zip(model_names, results):
    print(f"{display_name}: {test_score:.5f}")

# Data Every Day  

This notebook is featured on Data Every Day, a YouTube series where I train models on a new dataset each day.  

***

Check it out!  
https://youtu.be/1KVRzz8jtMk