## Introduction to Pandas

#### 1. Reading a CSV file

In [1]:
# Check the current working directory; relative file names used later
# (e.g. in read_csv) are resolved against it
import os
os.getcwd() # returns the working directory as a string

'C:\\Users\\Gourab\\Desktop\\Python'

In [2]:
# Set the working directory so that relative file names resolve inside it.
# NOTE: this path is specific to the author's machine; change it to the
# folder that holds cars.csv on your own system.
# The same path written as a normal (non-raw) string needs doubled backslashes:
# C:\\Users\\Gourab\\Desktop\\R
os.chdir(r"C:\Users\Gourab\Desktop\R") 

In [3]:
# Read the cars.csv data into a DataFrame named cars.
# The bare file name is a relative path, so it is looked up in the
# current working directory.
import pandas as pd
cars = pd.read_csv("cars.csv")

In [None]:
# Alternative: pass the full path to read_csv so the result does not depend
# on the current working directory (the path is specific to the author's machine)
import pandas as pd
cars = pd.read_csv("C:\\Users\\Gourab\\Desktop\\R\\cars.csv")

In [4]:
# Check what kind of object read_csv returned
type(cars)

pandas.core.frame.DataFrame

#### 2. Some initial steps with data

In [5]:
# A. Check the dimensions of the data set as a (rows, columns) tuple
cars.shape

(406, 9)

In [6]:
# B. Number of rows in the data set (len of a DataFrame counts its rows)
len(cars)

406

In [7]:
# OR: take the first element of the shape tuple, which is also the row count
cars.shape[0]

406

In [8]:
# C. Number of columns in the data set (second element of the shape tuple)
cars.shape[1]

9

In [9]:
cars.columns # Get the column labels (returned as an Index object)

Index(['Car', 'MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight',
       'Acceleration', 'Model', 'Origin'],
      dtype='object')

In [10]:
# OR: calling list() on the DataFrame also yields the column names
list(cars)  # returns a plain Python list instead of an Index

['Car',
 'MPG',
 'Cylinders',
 'Displacement',
 'Horsepower',
 'Weight',
 'Acceleration',
 'Model',
 'Origin']

In [None]:
# Another way to count the columns
len(cars.columns)

In [11]:
# Aside: rows have labels too, stored in the DataFrame's index
cars.index # Get the row index

RangeIndex(start=0, stop=406, step=1)

In [12]:
# Study the variable types: dtypes lists the data type of every column
cars.dtypes

Car              object
MPG             float64
Cylinders         int64
Displacement    float64
Horsepower        int64
Weight            int64
Acceleration    float64
Model             int64
Origin           object
dtype: object

In [None]:
# D. Getting the variable names (the column labels of the DataFrame)
cars.columns

In [13]:
# E. Display the first 5 rows of the data set (head() with no argument)
cars.head()

Unnamed: 0,Car,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model,Origin
0,Chevrolet Chevelle Malibu,18.0,8,307.0,130,3504,12.0,70,US
1,Buick Skylark 320,15.0,8,350.0,165,3693,11.5,70,US
2,Plymouth Satellite,18.0,8,318.0,150,3436,11.0,70,US
3,AMC Rebel SST,16.0,8,304.0,150,3433,12.0,70,US
4,Ford Torino,17.0,8,302.0,140,3449,10.5,70,US


In [None]:
# F. Display the first 10 rows of the data set
cars.head(10)

In [None]:
# G. Display the last 3 rows of the data set
cars.tail(3)

In [None]:
# H. Display the entire data set: a bare variable on the last line of a
# cell is rendered as that cell's output
cars

In [None]:
# Display a random sample of 5 rows. random_state pins the draw so that
# re-running the notebook shows the same rows every time.
cars.sample(5, random_state=42)

In [None]:
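# Recap of the row-preview methods used above:
# head   - first n rows
# tail   - last n rows
# sample - n randomly chosen rows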

#### 3. Choosing a single variable

In [14]:
# Method 1 - use square brackets with the column name as a string
cars["MPG"].head()

0    18.0
1    15.0
2    18.0
3    16.0
4    17.0
Name: MPG, dtype: float64

In [None]:
# Same square-bracket syntax, applied to another column
cars["Acceleration"].head()

In [None]:
# Column selected with dot notation; there is no head() here, so the
# whole column is the cell's output
cars.Origin

In [None]:
# head(2) limits the output to the first 2 values of the column
cars.Acceleration.head(2)

In [None]:
# Method 2 - use dot (.) notation: the column name is written as an attribute
cars.MPG.head()

In [None]:
# Dot notation on another column
cars.Cylinders.head()

#### 4. Some Basic Statistical Functions

In [None]:
# Check what type a single selected column has
type(cars.MPG)

In [None]:
# sum: total of all values in the MPG column
cars.MPG.sum()

In [None]:
# mean: arithmetic average of the MPG column
cars.MPG.mean()

In [None]:
# median: middle value of the MPG column
cars.MPG.median()

In [None]:
# std: standard deviation of the MPG column
cars.MPG.std()

In [None]:
# var: variance of the MPG column
cars.MPG.var()

In [None]:
# min: smallest value in the MPG column
cars.MPG.min()

In [None]:
# max: largest value in the MPG column
cars.MPG.max()

In [15]:
# 25th percentile (i.e. the 0.25 quantile)
cars.MPG.quantile(.25)

17.0

In [16]:
# 90th percentile (i.e. the 0.9 quantile)
cars.MPG.quantile(0.9)

34.25

In [17]:
# 90th, 95th and 99th percentiles in one call: passing a list of quantiles
# returns one value per requested quantile
cars.MPG.quantile([0.90,0.95,0.99])

0.90    34.250
0.95    37.000
0.99    43.385
Name: MPG, dtype: float64

In [20]:
# Inter-quartile range: 75th percentile minus 25th percentile
cars.MPG.quantile(0.75) - cars.MPG.quantile(0.25)

12.0

In [18]:
# Alternative: wrap the calculation in a reusable function
def IQR(x):
    """Return the inter-quartile range of x (75th minus 25th percentile)."""
    upper_quartile = x.quantile(0.75)
    lower_quartile = x.quantile(0.25)
    return upper_quartile - lower_quartile

In [19]:
# Apply the IQR function to the MPG column
IQR(cars.MPG)

12.0

In [None]:
# The same function works on any other numeric column, e.g. Weight
IQR(cars.Weight)

In [21]:
# describe: count, mean, std, min, quartiles and max of MPG in one call
# NOTE(review): the reported min of 0.0 looks like a placeholder for missing
# MPG values rather than a real measurement; confirm against the data source
cars.MPG.describe()

count    406.000000
mean      23.051232
std        8.401777
min        0.000000
25%       17.000000
50%       22.350000
75%       29.000000
max       46.600000
Name: MPG, dtype: float64

In [None]:
# describe on the whole DataFrame: summary statistics per column
# (numeric columns only, by default)
cars.describe()

### Some Functions and their Descriptions

    Function	Description
---------------------------------------------
       count	Number of non-null observations
         sum	Sum of values
        mean	Mean of values
         mad	Mean absolute deviation (removed in pandas 2.0)
      median	Arithmetic median of values
         min	Minimum
         max	Maximum
        mode	Mode
         abs	Absolute Value
        prod	Product of values
         std	Unbiased standard deviation
         var	Unbiased variance
         sem	Unbiased standard error of the mean
        skew	Unbiased skewness (3rd moment)
        kurt	Unbiased kurtosis (4th moment)
    quantile	Sample quantile (value at %)
      cumsum	Cumulative sum
     cumprod	Cumulative product
      cummax	Cumulative maximum
      cummin	Cumulative minimum