# *Data Analysis for Simulation Data*

### *Code by - OM*

In [None]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sklearn

# Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score

In [None]:
# Read & merge the data
def create_merged_dataset(path, output_path="Simulated_dataset.csv"):
    """Merge the Excel workbooks found in ``path`` into a single CSV file.

    Every workbook contributes its second column as one column of the merged
    table; the column is named after the workbook's file name without its
    extension. The first column of the first workbook (in sorted file-name
    order) is stored once as the ``Timestamps`` column.

    Parameters
    ----------
    path : str
        Directory that contains the Excel workbooks.
    output_path : str, optional
        Destination of the merged CSV. Defaults to ``"Simulated_dataset.csv"``
        in the current working directory.

    Raises
    ------
    FileNotFoundError
        If ``path`` contains no Excel workbook.
    """
    excel_suffixes = (".xlsx", ".xlsm", ".xls")

    # Only Excel workbooks are read: other files in the folder (for example a
    # previously exported CSV) and temporary "~$" lock files are skipped.
    # Sorting makes the column order reproducible, since os.listdir returns
    # entries in arbitrary order.
    files = sorted(
        name
        for name in os.listdir(path)
        if name.lower().endswith(excel_suffixes) and not name.startswith("~$")
    )
    if not files:
        raise FileNotFoundError("No Excel workbooks found in '{}'".format(path))

    data = pd.DataFrame()
    for name in files:
        # Read from the directory that was actually listed, not a fixed folder.
        temp = pd.read_excel(os.path.join(path, name))

        # The time axis is taken from the first workbook only.
        if "Timestamps" not in data.columns:
            data.insert(0, "Timestamps", temp.iloc[:, 0])

        # Append the signal column, named after the file without its extension.
        data.insert(len(data.columns), os.path.splitext(name)[0], temp.iloc[:, 1])

    # The default integer index is written as an unnamed first column. This
    # layout is kept on purpose: the loading cell selects "Timestamps" via
    # index_col and a later cell drops the resulting 'Unnamed: 0' column.
    data.to_csv(output_path)


# NOTE(review): this writes Simulated_dataset.csv to the working directory,
# while the analysis cell reads "Datasets/Simulated_dataset.csv" - confirm
# which location is intended.
create_merged_dataset("Datasets")

## *Data Analysis*

In [None]:
# Load the merged simulation data, using the Timestamps column as the row index
merged_csv = "Datasets/Simulated_dataset.csv"
df = pd.read_csv(merged_csv, index_col="Timestamps")
df.head(n=5)

Unnamed: 0_level_0,Unnamed: 0,Cell_Voltage,SOC,Drive_Cycle,Current,Temperature
Timestamps,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0.0,0,3.5,1.0,0.0,0.0,298.15
0.5,1,3.5,1.0,0.0,0.0,298.149991
1.0,2,3.5,1.0,0.0,-15.947959,298.149983
1.5,3,3.460239,0.999877,0.0,-15.947959,298.150435
2.0,4,3.455171,0.999754,0.0,-15.947959,298.150893


In [None]:
# Summary of the dataset: row count, column count and per-column dtypes
n_entries, n_features = df.shape

print("--------- Statistics of dataset: ---------")
print("Data entries: ", n_entries)
print("Features: ", n_features)

print("\n\n--------- Datatypes: ---------")
# Shown as a one-column table (column label 0) rather than a plain Series
df.dtypes.to_frame()

--------- Statistics of dataset: ---------
Data entries:  1048575
Features:  6


--------- Datatypes: ---------


Unnamed: 0,0
Unnamed: 0,int64
Cell_Voltage,float64
SOC,float64
Drive_Cycle,float64
Current,float64
Temperature,float64


In [None]:
# Remove the 'Unnamed: 0' column (the integer row counter shown by df.head()).
# errors='ignore' keeps this cell safe to re-run: the in-place drop raised a
# KeyError on a second execution because the column was already gone.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Find the missing values
# "?" placeholders are treated as missing values as well.
df.replace("?", np.nan, inplace=True)

# Boolean mask: True where a value is missing.
missing_data = df.isnull()
print("---------Number of missing entries:--------- \n")
# Summing the mask counts the True cells per column, i.e. the missing entries.
# (count() on the mask counts its non-NA cells, which is always the full row
# count, so the previous "rows - count()" expression reported 0 for every
# column regardless of the data.)
missing_data.sum().to_frame('Missing count')

---------Number of missing entries:--------- 



Unnamed: 0,Missing count
Cell_Voltage,0
SOC,0
Drive_Cycle,0
Current,0
Temperature,0


In [None]:
# Preview the first five rows of the cleaned frame
df.head(n=5)

Unnamed: 0_level_0,Cell_Voltage,SOC,Drive_Cycle,Current,Temperature
Timestamps,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,3.5,1.0,0.0,0.0,298.15
0.5,3.5,1.0,0.0,0.0,298.149991
1.0,3.5,1.0,0.0,-15.947959,298.149983
1.5,3.460239,0.999877,0.0,-15.947959,298.150435
2.0,3.455171,0.999754,0.0,-15.947959,298.150893


<h2 id="data_standardization">Data Standardization / Data Normalization</h2>
<p>
Data is usually collected from different sources in different formats.
(Data standardization is also the name of a particular type of data normalization, in which we subtract the mean and divide by the standard deviation.)
</p>
    
<b>What is Standardization?</b>
<p>Standardization is the process of transforming data into a common format, which allows the researcher to make meaningful comparisons.
</p>

<b>Example</b>


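<p>The formula for unit conversion (fuel consumption in miles per gallon to litres per 100 km) is</p>

$$\text{L/100km} = \frac{235}{\text{mpg}}$$

<p>We can do many mathematical operations directly in Pandas. In this notebook, the cell below rescales every column by dividing it by its maximum value.</p>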

In [None]:
# Keep an untouched deep copy of the unscaled values
data = df.copy(deep=True)

# Simple max scaling: every value is divided by the maximum of its column.
# NOTE(review): a column whose maximum is 0 would turn into NaN/inf here -
# confirm that no such column exists in the dataset.
cols = df.columns
df = df / df.max()

df.head(n=5)

Unnamed: 0_level_0,Cell_Voltage,SOC,Drive_Cycle,Current,Temperature
Timestamps,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,0.949642,1.0,0.0,0.0,0.30863
0.5,0.949642,1.0,0.0,0.0,0.30863
1.0,0.949642,1.0,0.0,-0.629183,0.30863
1.5,0.938854,0.999877,0.0,-0.629183,0.30863
2.0,0.937479,0.999754,0.0,-0.629183,0.308631


## *Split data - Train-Test Split & features and labels*

In [None]:
#  Separate the model inputs (features) from the prediction target (label)
feature_columns = ['Cell_Voltage', 'Current', 'Drive_Cycle', 'Temperature']
label_columns = ['SOC']

x = df.loc[:, feature_columns]
y = df.loc[:, label_columns]
print("Data splitting complete.....")

Data splitting complete.....


In [None]:
# Preview the first five rows of the feature matrix
x.head(n=5)

Unnamed: 0_level_0,Cell_Voltage,Current,Drive_Cycle,Temperature
Timestamps,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.0,0.949642,0.0,0.0,0.30863
0.5,0.949642,0.0,0.0,0.30863
1.0,0.949642,-0.629183,0.0,0.30863
1.5,0.938854,-0.629183,0.0,0.30863
2.0,0.937479,-0.629183,0.0,0.308631


In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

## *Model development & Testing*

In [None]:
# Random forest regressor with default hyperparameters and a fixed seed
rfregressor = RandomForestRegressor(
    random_state=42,
)

In [None]:
# Flatten the single-column target frame into a 1-D array before fitting
train_target = train_y.to_numpy().ravel()
rfregressor.fit(train_x, train_target)

RandomForestRegressor(random_state=42)

In [None]:
# Predict the target for the held-out test features
preds = rfregressor.predict(X=test_x)

In [None]:
# For a regressor, score() returns the coefficient of determination (R^2) of
# the predictions on the given data - it is not a classification accuracy,
# so it is reported under its proper name and without a percent sign.
r2 = rfregressor.score(test_x, test_y)
accuracy = r2  # alias kept in case other cells still read `accuracy`
print("\n------ R^2 Score =", r2, "----------")


------ Accuracy Score = 99.66676640764379 % ----------
