## Introduction to Pandas

#### 1. Reading a CSV file

In [1]:
# Check the current working directory; relative file names passed to
# readers such as pd.read_csv are resolved against it.
import os
os.getcwd()

'C:\\Users\\DELL\\Desktop\\ML\\05032018'

In [2]:
# Folder that holds the tutorial's data files.
# NOTE(review): absolute, machine-specific path; adjust it before running
# this notebook on another computer.
DATA_DIR = "C:\\Users\\DELL\\Desktop\\ML\\05032018"

# Make that folder the working directory so the bare file name below resolves.
os.chdir(DATA_DIR)

# Load cars.csv into a pandas DataFrame named `cars`.
import pandas as pd
cars = pd.read_csv("cars.csv")

#### 2. Some initial steps with data

In [3]:
#A. Check the dimensions of the data set: .shape is a (rows, columns) tuple
cars.shape

(406, 9)

In [5]:
#B. Number of rows in the data set
len(cars.index)   # .index holds the row labels, so its length is the row count

406

In [None]:
cars.index # Row labels (the index) of the DataFrame

In [6]:
#OR simply: len() of a DataFrame is its number of rows
len(cars)

406

In [7]:
#OR: the first element of .shape is the number of rows
cars.shape[0]

406

In [9]:
#C. Number of columns in the data set
cars.columns  # Column labels (not shown here: only a cell's last expression is displayed)

len(cars.columns)

9

In [10]:
#OR: the second element of .shape is the number of columns
cars.shape[1]

9

In [11]:
#Studying the variable types (one dtype per column)
cars.dtypes    # 'object' dtype usually means text (string) data, e.g. a categorical variable stored as strings

Car              object
MPG             float64
Cylinders         int64
Displacement    float64
Horsepower        int64
Weight            int64
Acceleration    float64
Model             int64
Origin           object
dtype: object

In [12]:
#D. Getting the variable (column) names as a pandas Index
cars.columns

Index(['Car', 'MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
       'Acceleration', 'Model', 'Origin'],
      dtype='object')

In [13]:
#OR
list(cars)  # iterating a DataFrame yields its column labels, so this returns them as a plain Python list

['Car',
 'MPG',
 'Cylinders',
 'Displacement',
 'Horsepower',
 'Weight',
 'Acceleration',
 'Model',
 'Origin']

In [15]:
#E. Printing the first 5 rows of the data set
cars.head()   # head() defaults to 5 rows; the tabular result is a pandas DataFrame

Unnamed: 0,Car,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model,Origin
0,Chevrolet Chevelle Malibu,18.0,8,307.0,130,3504,12.0,70,US
1,Buick Skylark 320,15.0,8,350.0,165,3693,11.5,70,US
2,Plymouth Satellite,18.0,8,318.0,150,3436,11.0,70,US
3,AMC Rebel SST,16.0,8,304.0,150,3433,12.0,70,US
4,Ford Torino,17.0,8,302.0,140,3449,10.5,70,US


In [16]:
#F. Printing the first 10 rows of the data set
cars.head(10)

Unnamed: 0,Car,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model,Origin
0,Chevrolet Chevelle Malibu,18.0,8,307.0,130,3504,12.0,70,US
1,Buick Skylark 320,15.0,8,350.0,165,3693,11.5,70,US
2,Plymouth Satellite,18.0,8,318.0,150,3436,11.0,70,US
3,AMC Rebel SST,16.0,8,304.0,150,3433,12.0,70,US
4,Ford Torino,17.0,8,302.0,140,3449,10.5,70,US
5,Ford Galaxie 500,15.0,8,429.0,198,4341,10.0,70,US
6,Chevrolet Impala,14.0,8,454.0,220,4354,9.0,70,US
7,Plymouth Fury iii,14.0,8,440.0,215,4312,8.5,70,US
8,Pontiac Catalina,14.0,8,455.0,225,4425,10.0,70,US
9,AMC Ambassador DPL,15.0,8,390.0,190,3850,8.5,70,US


In [17]:
#G. Printing the last 5 rows of the data set (tail() defaults to 5 rows)
cars.tail()

Unnamed: 0,Car,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model,Origin
401,Ford Mustang GL,27.0,4,140.0,86,2790,15.6,82,US
402,Volkswagen Pickup,44.0,4,97.0,52,2130,24.6,82,Europe
403,Dodge Rampage,32.0,4,135.0,84,2295,11.6,82,US
404,Ford Ranger,28.0,4,120.0,79,2625,18.6,82,US
405,Chevy S-10,31.0,4,119.0,82,2720,19.4,82,US


In [19]:
#H. Printing the entire data set
# (presumably left commented out so the notebook is not flooded with every row)
#cars

#### 3. Choosing a single variable

In [20]:
#Method1 - Use square brackets with the column name; the result is a pandas Series
cars["MPG"].head()

0    18.0
1    15.0
2    18.0
3    16.0
4    17.0
Name: MPG, dtype: float64

In [21]:
# Same square-bracket selection, applied to another column
cars["Cylinders"].head()

0    8
1    8
2    8
3    8
4    8
Name: Cylinders, dtype: int64

In [23]:
#Method2 - Use dot (.) attribute access
# (only works when the column name is a valid Python identifier that does not
#  clash with an existing DataFrame attribute or method)
cars.Model.head()

0    70
1    70
2    70
3    70
4    70
Name: Model, dtype: int64

#### 4. Some Basic Statistical Functions

In [24]:
#sum: total of all values in the MPG column
cars.MPG.sum()

9358.800000000003

In [25]:
#mean: arithmetic average of the MPG column
cars.MPG.mean()

23.051231527093602

In [26]:
#median: middle value (50th percentile) of the MPG column
cars.MPG.median()

22.35

In [27]:
#standard deviation (pandas uses the sample estimate, ddof=1, by default)
cars.MPG.std()

8.40177735227059

In [28]:
#variance (sample variance, ddof=1, by default)
cars.MPG.var()

70.58986267712702

In [29]:
#minimum
# NOTE(review): the recorded result is 0.0, which looks like a placeholder for
# missing MPG values rather than a real measurement. Confirm against the data
# before relying on the statistics in this section.
cars.MPG.min()

0.0

In [30]:
#maximum
cars.MPG.max()

46.600000000000001

In [31]:
#25th percentile (the 0.25 quantile, i.e. the first quartile)
cars.MPG.quantile(0.25)

17.0

In [32]:
#90th percentile (the 0.9 quantile)
cars.MPG.quantile(0.9)

34.25

In [33]:
#90th, 95th and 99th percentiles: passing a list of quantiles returns a Series indexed by quantile
cars.MPG.quantile([0.9,0.95,0.99])

0.90    34.250
0.95    37.000
0.99    43.385
Name: MPG, dtype: float64

In [34]:
#Inter-quartile range: spread of the middle 50% of the MPG values (Q3 - Q1)
mpg_q1, mpg_q3 = cars.MPG.quantile(0.25), cars.MPG.quantile(0.75)
mpg_q3 - mpg_q1

12.0

In [35]:
#OR create a reusable function
def IQR(variable, lower=0.25, upper=0.75):
    """Return the spread between two quantiles of ``variable``.

    With the default arguments this is the inter-quartile range:
    the 75th percentile minus the 25th percentile.

    Parameters
    ----------
    variable : pandas.Series
        Any object exposing a ``quantile(q)`` method, such as a numeric
        DataFrame column.
    lower, upper : float, optional
        Quantiles (fractions between 0 and 1) that bound the range.
        Defaults are 0.25 and 0.75.

    Returns
    -------
    The value of ``variable.quantile(upper) - variable.quantile(lower)``.
    """
    return variable.quantile(upper) - variable.quantile(lower)

In [36]:
# Apply the IQR helper to the MPG column
IQR(cars.MPG)

12.0

In [37]:
#describe: count, mean, std, min, quartiles and max in a single call
cars.MPG.describe()

count    406.000000
mean      23.051232
std        8.401777
min        0.000000
25%       17.000000
50%       22.350000
75%       29.000000
max       46.600000
Name: MPG, dtype: float64

### Some Functions and their Descriptions

    Function	Description
---------------------------------------------
       count	Number of non-null observations
         sum	Sum of values
        mean	Mean of values
         mad	Mean absolute deviation (deprecated in pandas 1.5 and removed in 2.0)
      median	Arithmetic median of values
         min	Minimum
         max	Maximum
        mode	Mode
         abs	Absolute Value
        prod	Product of values
         std	Unbiased standard deviation
         var	Unbiased variance
         sem	Unbiased standard error of the mean
        skew	Unbiased skewness (3rd moment)
        kurt	Unbiased kurtosis (4th moment)
    quantile	Sample quantile (value at %)
      cumsum	Cumulative sum
     cumprod	Cumulative product
      cummax	Cumulative maximum
      cummin	Cumulative minimum