In [2]:
import pandas as pd
# Cap DataFrame/Series display at 5 rows so previews are truncated and short.
pd.options.display.max_rows = 5

# Load the forest-fires dataset from the current working directory.
data = pd.read_csv("forestfires.csv")
data

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.0
516,6,3,nov,tue,79.5,3.0,106.7,1.1,11.8,31,4.5,0.0,0.0


# Data Types 
1. `df.dtypes` - gives the data type of every column in the DataFrame
2. `s.dtype` - gives the data type of a single column (Series)
3. `s.astype(datatype)` - returns the column converted to the given data type

In [3]:
# Show the data type of every column in the DataFrame.
data.dtypes

X         int64
Y         int64
         ...   
rain    float64
area    float64
Length: 13, dtype: object

In [4]:
# Data type of the single column "X".
data["X"].dtype

dtype('int64')

In [5]:
# Convert column "X" to float64. The result is only displayed, not assigned,
# so the column stored in `data` keeps its original dtype.
data["X"].astype('float64')

0      7.0
1      7.0
      ... 
515    1.0
516    6.0
Name: X, Length: 517, dtype: float64

# Missing Data

`NaN` is a floating-point value, so a numeric column that contains `NaN` is stored as `float64` by default. To find or handle `NaN` entries you can use:
1. `pd.isnull()`
2. `pd.notnull()`
3. `Series.fillna("para")` - every `NaN` in the column is replaced with `"para"`.
4. `Series.replace("Value to change", "Changed value")`
5. `DataFrame.sample()` - returns a random sample of rows.
6. `sum()` - use this together with `pd.isnull()` to count the missing values, e.g. `pd.isnull(data.X).sum()`.

In [6]:
# Boolean-mask selection: keep only the rows where X is NaN.
# X has no missing values in this dataset, so the result is empty.
data[pd.isnull(data.X)]

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area


In [7]:
# Boolean-mask selection: keep only the rows where X is not NaN.
data[pd.notnull(data.X)]

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.0
516,6,3,nov,tue,79.5,3.0,106.7,1.1,11.8,31,4.5,0.0,0.0


In [8]:
# Replace NaN entries in X with "unknown". X contains no NaN here, so the
# values and the int64 dtype come back unchanged.
data["X"].fillna("unknown")

0      7
1      7
      ..
515    1
516    6
Name: X, Length: 517, dtype: int64

In [9]:
# Replace every 7 in X with "unknown". Mixing strings with integers makes the
# resulting Series dtype `object`.
data["X"].replace(7,"unknown")

0      unknown
1      unknown
        ...   
515          1
516          6
Name: X, Length: 517, dtype: object

In [13]:
# Draw n=10 values at random from column X. The result is a Series (not a
# DataFrame) that keeps the original index label of every sampled value.
data["X"].sample(n=10)

39     4
267    2
      ..
173    4
503    2
Name: X, Length: 10, dtype: int64

In [16]:
# frac:    fraction of the rows to draw (0.5 of 100 rows -> 50 rows);
#          it cannot be combined with n.
# weights: rows with larger values in this column are more likely to be drawn.
# replace: True allows the same row to be drawn more than once.
data.sample(
    frac=0.5,
    weights="X",
    replace=True,
)

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
410,6,3,feb,fri,84.1,7.3,52.8,2.7,14.7,42,2.7,0.0,0.00
219,6,5,mar,mon,90.1,39.7,86.6,6.2,15.2,27,3.1,0.0,31.86
...,...,...,...,...,...,...,...,...,...,...,...,...,...
230,4,4,sep,wed,92.9,133.3,699.6,9.2,26.4,21,4.5,0.0,88.49
383,9,6,aug,thu,91.6,248.4,753.8,6.3,20.5,58,2.7,0.0,42.87
