## Null Values

In [8]:
import pandas as pd
import numpy as np

In [10]:
# Small demo frame: two numeric columns with missing entries (NaN)
# and one string column that is fully populated.
df = pd.DataFrame(dict(
    value1=[100, np.nan, 234, np.nan],
    value2=[300, 121, np.nan, np.nan],
    value3=['XUI', 'VYU', 'NMA', 'IUY'],
))
df.head()

Unnamed: 0,value1,value2,value3
0,100.0,300.0,XUI
1,,121.0,VYU
2,234.0,,NMA
3,,,IUY


In [11]:
# Boolean mask with the same shape as df: True wherever a cell is NaN.
df.isnull()

Unnamed: 0,value1,value2,value3
0,False,False,False
1,True,False,False
2,False,True,False
3,True,True,False


In [12]:
df.isnull().sum()              # Count the NaN values in each column (one count per column, not a single grand total).

value1    2
value2    2
value3    0
dtype: int64

In [13]:
# isna() produces the same True/False mask as isnull().
df.isna()

Unnamed: 0,value1,value2,value3
0,False,False,False
1,True,False,False
2,False,True,False
3,True,True,False


In [18]:
# isnull() and isna() are the same, so this is the per-column NaN count again.
# NOTE: the saved output below shows all zeros because this cell was last run as
# In [18], after the fillna cell (In [16]) had already replaced the NaN values.
df.isna().sum()

value1    0
value2    0
value3    0
dtype: int64

In [16]:
# Fill each NaN with the mean of its own column.
# numeric_only=True limits the mean to value1/value2; without it, pandas >= 2.0
# raises a TypeError when it reaches the string column value3.
# Reassigning (instead of inplace=True) keeps the step explicit and chainable.
# value1 mean: (100 + 234) / 2 = 167.0    value2 mean: (300 + 121) / 2 = 210.5
df = df.fillna(df.mean(numeric_only=True))
df

Unnamed: 0,value1,value2,value3
0,100.0,300.0,XUI
1,167.0,121.0,VYU
2,234.0,210.5,NMA
3,167.0,210.5,IUY


In [22]:
# Verify the mean of each numeric column. numeric_only=True skips the string
# column value3, which would otherwise raise a TypeError on pandas >= 2.0.
df.mean(numeric_only=True)

value1    167.0
value2    210.5
dtype: float64

In [24]:
# Rows ordered from the smallest to the largest value2.
# sort_values returns a new frame; df itself is not reordered.
df.sort_values(
    by='value2',
    ascending=True,
)

Unnamed: 0,value1,value2,value3
1,167.0,121.0,VYU
2,234.0,210.5,NMA
3,167.0,210.5,IUY
0,100.0,300.0,XUI


In [35]:
# ascending=False gives descending order: largest value2 first.
df.sort_values(
    by='value2',
    ascending=False,
)

Unnamed: 0,value1,value2,value3
0,100.0,300.0,XUI
2,234.0,210.5,NMA
3,167.0,210.5,IUY
1,167.0,121.0,VYU


In [36]:
# Load the automobile dataset from a path relative to the notebook's working directory.
# NOTE(review): the file name ends in '.csv.csv' — confirm it matches the file on disk.
data = pd.read_csv('Data_Sets/1.1 Automobile.csv.csv')

In [38]:
# Most expensive cars: sort by price from high to low, then keep the first rows.
(
    data
    .sort_values(by='price', ascending=False)
    .head()
)

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,number_of_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
71,1,140,mercedes-benz,gas,std,two,hardtop,rwd,front,112.0,...,304,mpfi,3.8,3.35,8.0,184,4500,14,16,45400
15,0,149,bmw,gas,std,two,sedan,rwd,front,103.5,...,209,mpfi,3.62,3.39,8.0,182,5400,16,22,41315
70,0,140,mercedes-benz,gas,std,four,sedan,rwd,front,120.9,...,308,mpfi,3.8,3.35,8.0,184,4500,14,16,40960
125,3,128,porsche,gas,std,two,convertible,rwd,rear,89.5,...,194,mpfi,3.74,2.9,9.5,207,5900,17,25,37028
16,0,149,bmw,gas,std,four,sedan,rwd,front,110.0,...,209,mpfi,3.62,3.39,8.0,182,5400,15,20,36880


In [40]:
# Same price ranking, narrowed to the make and price columns.
(
    data.sort_values(by='price', ascending=False)[['make', 'price']]
    .head()
)

Unnamed: 0,make,price
71,mercedes-benz,45400
15,bmw,41315
70,mercedes-benz,40960
125,porsche,37028
16,bmw,36880


In [41]:
data['make'].count()      # Number of non-null entries in the 'make' column

201

In [44]:
data['make'].value_counts()                # How many times each distinct value appears in the 'make' column

toyota           32
nissan           18
mazda            17
honda            13
mitsubishi       13
subaru           12
volkswagen       12
peugot           11
volvo            11
dodge             9
mercedes-benz     8
bmw               8
plymouth          7
audi              6
saab              6
porsche           4
chevrolet         3
alfa-romero       3
jaguar            3
isuzu             2
renault           2
mercury           1
Name: make, dtype: int64

In [45]:
# Frequency of each distinct value in the 'number_of_doors' column.
data['number_of_doors'].value_counts()

four    114
two      87
Name: number_of_doors, dtype: int64

### Assignment: What is the difference between count() and value_counts()?

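The pandas **count()** function returns the number of non-null values: on a DataFrame it gives one count per column (or per row with `axis=1`), and a **count** can also be computed for each level of a multi-index DataFrame. The pandas **value_counts()** method returns the frequency of each unique value in a Series, so it tells you how often every distinct value occurs rather than how many values there are in total.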

# Concatenation

In [46]:
# Plain dictionary of equal-length lists; each key becomes a column
# once it is turned into a DataFrame.
mm = {
    'one': [2, 3, 1, 4, 5],
    'two': [5, 4, 3, 2, 1],
    'letter': ['a', 'a', 'b', 'b', 'c'],
}

In [47]:
# Show the raw dictionary before it is converted to a DataFrame.
mm

{'one': [2, 3, 1, 4, 5],
 'two': [5, 4, 3, 2, 1],
 'letter': ['a', 'a', 'b', 'b', 'c']}

In [48]:
# Turn the dictionary into a DataFrame: keys become column names,
# list positions become the row index 0..4.
mm1 = pd.DataFrame(data=mm)
mm1

Unnamed: 0,one,two,letter
0,2,5,a
1,3,4,a
2,1,3,b
3,4,2,b
4,5,1,c


In [49]:
# Show df again (NaN values already filled) before combining it with mm1.
df

Unnamed: 0,value1,value2,value3
0,100.0,300.0,XUI
1,167.0,121.0,VYU
2,234.0,210.5,NMA
3,167.0,210.5,IUY


In [50]:
# Stack the two frames on top of each other (axis=0 is the default).
# Their columns do not overlap, so each frame gets NaN in the other's columns;
# sort=True orders the combined columns alphabetically.
# Row labels are kept as-is, which is why 0-3 appear twice in the result.
new_df = pd.concat([df, mm1], axis=0, sort=True)

In [51]:
# Show the row-wise concatenation result.
new_df

Unnamed: 0,letter,one,two,value1,value2,value3
0,,,,100.0,300.0,XUI
1,,,,167.0,121.0,VYU
2,,,,234.0,210.5,NMA
3,,,,167.0,210.5,IUY
0,a,2.0,5.0,,,
1,a,3.0,4.0,,,
2,b,1.0,3.0,,,
3,b,4.0,2.0,,,
4,c,5.0,1.0,,,


In [52]:
# Place the two frames side by side (axis=1). Rows are matched on index label,
# so label 4, which only mm1 has, gets NaN in df's columns.
new_df1 = pd.concat([df, mm1], axis=1, sort=True)

In [53]:
# Show the column-wise concatenation result.
new_df1

Unnamed: 0,value1,value2,value3,one,two,letter
0,100.0,300.0,XUI,2,5,a
1,167.0,121.0,VYU,3,4,a
2,234.0,210.5,NMA,1,3,b
3,167.0,210.5,IUY,4,2,b
4,,,,5,1,c
