## Importing the libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from modules import data_wrangle

# 1. Reading the dataset

In [2]:
# Load the train and test CSV files into separate DataFrames
df_train = pd.read_csv(filepath_or_buffer="dataset/train.csv")
df_test = pd.read_csv(filepath_or_buffer="dataset/test.csv")

# 2. Inspecting the dataset

## i. Viewing head

In [3]:
# Preview the first five rows of the training data
display(df_train.head(n=5))

Unnamed: 0,id,temperature,irradiance,humidity,panel_age,maintenance_count,soiling_ratio,voltage,current,module_temperature,cloud_coverage,wind_speed,pressure,string_id,error_code,installation_type,efficiency
0,0,7.817315,576.17927,41.24308670850264,32.135501,4.0,0.803199,37.403527,1.963787,13.691147,62.494044,12.82491203459621,1018.8665053152532,A1,,,0.562096
1,1,24.785727,240.003973,1.3596482765960705,19.97746,8.0,0.479456,21.843315,0.241473,27.545096,43.851238,12.012043660984917,1025.6238537572883,D4,E00,dual-axis,0.396447
2,2,46.652695,687.612799,91.26536837560256,1.496401,4.0,0.822398,48.222882,4.1918,43.363708,,1.814399755560454,1010.9226539809572,C3,E00,,0.573776
3,3,53.339567,735.141179,96.1909552117616,18.491582,3.0,0.837529,46.295748,0.960567,57.720436,67.361473,8.736258932034128,1021.8466633134252,A1,,dual-axis,0.629009
4,4,5.575374,12.241203,27.495073003585222,30.722697,6.0,0.551833,0.0,0.898062,6.786263,3.632,0.52268384077164,1008.5559577591928,B2,E00,fixed,0.341874


In [4]:
# Preview the first five rows of the test data
display(df_test.head(n=5))

Unnamed: 0,id,temperature,irradiance,humidity,panel_age,maintenance_count,soiling_ratio,voltage,current,module_temperature,cloud_coverage,wind_speed,pressure,string_id,error_code,installation_type
0,0,17.618379,85.449838,90.81542277591532,13.910963,6.0,0.889765,6.370396,0.069101,19.517274,33.509889,7.1819582155525445,1034.782455188643,C3,E01,tracking
1,1,34.826323,722.801748,20.98299301574633,20.916528,4.0,0.590372,30.095867,1.713852,37.421443,32.32706,4.184581989921003,992.3197520437312,D4,E00,
2,2,33.776934,485.491998,55.61404977577451,1.446962,3.0,0.611425,28.42443,1.696936,32.147763,69.613333,6.25944104775485,999.2134568588948,D4,E01,dual-axis
3,3,18.584189,350.02272,49.04476645510075,18.810133,5.0,,7.848038,0.787188,25.734118,42.86276,2.7696074663593944,1026.650078215452,C3,E02,dual-axis
4,4,43.044908,437.295622,8.761571340027164,,8.0,0.564938,12.300717,1.86762,,51.025763,11.846974043208318,1010.809942771749,B2,,fixed


**Observation:**

* Null values are visible in both the train and test previews
* The target column here is `efficiency` and it is a continuous variable

## ii. Dataset shape

In [5]:
# Report the row/column counts of both splits with a single print call
print(
    f"There are {df_train.shape[0]:,} rows & {df_train.shape[1]} columns in the train data",
    f"There are {df_test.shape[0]:,} rows & {df_test.shape[1]} columns in the test data",
    sep="\n",
)

There are 20,000 rows & 17 columns in the train data
There are 12,000 rows & 16 columns in the test data


## iii. Null value count

In [6]:
# Count missing values per column (summing the boolean mask down the rows)
df_train.isna().sum(axis=0)

id                       0
temperature           1001
irradiance             987
humidity                 0
panel_age             1011
maintenance_count     1027
soiling_ratio         1010
voltage                993
current                977
module_temperature     978
cloud_coverage        1010
wind_speed               0
pressure                 0
string_id                0
error_code            5912
installation_type     5028
efficiency               0
dtype: int64

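**Observation:** 11 of the 17 columns in the train data contain nulls. Nine numeric columns are each missing roughly 5% of their values (about 1,000 of 20,000 rows), while `error_code` (5,912) and `installation_type` (5,028) are missing far more.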

## iv. Inspecting data types

In [7]:
# Show the dtype pandas inferred for each column of the training data
df_train.dtypes

id                      int64
temperature           float64
irradiance            float64
humidity               object
panel_age             float64
maintenance_count     float64
soiling_ratio         float64
voltage               float64
current               float64
module_temperature    float64
cloud_coverage        float64
wind_speed             object
pressure               object
string_id              object
error_code             object
installation_type      object
efficiency            float64
dtype: object

In [8]:
# Show the dtype pandas inferred for each column of the test data
df_test.dtypes

id                      int64
temperature           float64
irradiance            float64
humidity               object
panel_age             float64
maintenance_count     float64
soiling_ratio         float64
voltage               float64
current               float64
module_temperature    float64
cloud_coverage        float64
wind_speed             object
pressure               object
string_id              object
error_code             object
installation_type      object
dtype: object

**Observation:** The columns `humidity`, `wind_speed` & `pressure` are of type **object** in both outputs above. Ideally, they should be of type **float**.

# 3. Data Wrangling

## i. Fixing the data types of columns

In [9]:
# Five most frequent raw values in `humidity`
df_train["humidity"].value_counts().head(5)

humidity
unknown              50
error                40
badval               37
49.75405575152114     1
4.749053121788882     1
Name: count, dtype: int64

In [10]:
# Five most frequent raw values in `wind_speed`
df_train["wind_speed"].value_counts().head(5)

wind_speed
badval               42
error                41
unknown              36
12.82491203459621     1
4.653074989366715     1
Name: count, dtype: int64

In [11]:
# Five most frequent raw values in `pressure`
df_train["pressure"].value_counts().head(5)

pressure
unknown               46
error                 45
badval                44
1008.5019530390653     1
1015.6842178751405     1
Name: count, dtype: int64

**Observation:**

* Values such as `['unknown', 'error', 'badval']` in the above columns most likely come from faulty sensor readings
* These values will be replaced with `NaN`

In [12]:
# Work on independent copies so the raw frames stay untouched
# (DataFrame.copy() makes a deep copy by default)
df_train_clnd = df_train.copy()
df_test_clnd = df_test.copy()

In [13]:
# Turn the placeholder strings seen in the value counts above into NaN.
# The replacement is applied to every column of each frame, in place.
df_train_clnd.replace(
    to_replace=["unknown", "error", "badval"], value=np.nan, inplace=True
)
df_test_clnd.replace(
    to_replace=["unknown", "error", "badval"], value=np.nan, inplace=True
)

In [14]:
# Run the project helper on both cleaned frames for the three columns that
# were read as `object`.
# NOTE(review): `data_wrangle.fix_dtypes` is defined outside this notebook;
# presumably it casts the listed columns to a numeric dtype -- confirm.
df_train_clnd, df_test_clnd = (
    data_wrangle.fix_dtypes(
        df=frame,
        columns=["humidity", "wind_speed", "pressure"],
    )
    for frame in (df_train_clnd, df_test_clnd)
)