In [165]:
import pandas as pd

### Series
* A Series is an index-based, column-like data structure that represents a 1-D array.
* We can use the `pd.Series()` constructor to create a pandas Series object.

In [151]:
# Two ways to build a Series.
# 1. From a list: pandas assigns the default integer index 0..n-1.
odd = list(range(1, 10, 2))
odd_series = pd.Series(data=odd)
print(odd_series)

# 2. From a dictionary: the keys become the index labels.
even_series = pd.Series(
    dict(zero=0, two=2, four=4, six=6, eight=8)
)
even_series

0    1
1    3
2    5
3    7
4    9
dtype: int64


zero     0
two      2
four     4
six      6
eight    8
dtype: int64

In [157]:
# Series with explicit index labels (passed via `index=`) instead of 0..n-1
person_series = pd.Series(
    data=['Joy', 28, 'San Francisco'],
    index=['Name', 'Age', 'Address'],
)
person_series

Name                 Joy
Age                   28
Address    San Francisco
dtype: object

In [188]:
## Access Series data
# 1. Access by index label (odd_series has the default integer index, so label 3 is the 4th element)
print(odd_series[3])

# 2. Access by string index label: bracket syntax and attribute syntax give the same value
print(even_series['six'])
print(even_series.six)

7
6
6


### DataFrames
* A DataFrame represents tabular data and consists of one or more Series.
* We can use the `pd.DataFrame()` constructor to create a pandas DataFrame object.

In [166]:
# Two ways to build a DataFrame.
# 1. From a dictionary: each key becomes a column.
odd_even = dict(odd=[1, 3, 5, 7, 9], even=[0, 2, 4, 6, 8])
odd_even_df = pd.DataFrame(data=odd_even)
print(odd_even_df)

# 2. From a CSV file (this rebinds odd_even_df to the file's contents)
odd_even_df = pd.read_csv('odd-even.csv')
odd_even_df

   odd  even
0    1     0
1    3     2
2    5     4
3    7     6
4    9     8


Unnamed: 0,Odd,Even
0,1,0
1,3,2
2,5,4
3,7,6
4,9,8


In [221]:
# DataFrame with explicit row labels supplied through `index=`
person = {
    "0": ['Joy', 28, 'San Francisco'],
    "1": ['Tom', 32, 'New York'],
}
person_df = pd.DataFrame(data=person, index=['Name', 'Age', 'Address'])
print(person_df)

# Same idea from a CSV file: index_col=0 uses the first column as the row labels.
person_df = pd.read_csv('person.csv', index_col=0)
person_df

                     0         1
Name               Joy       Tom
Age                 28        32
Address  San Francisco  New York


Unnamed: 0,0,1
Name,Joy,Tom
Age,28,32
Address,San Francisco,New York


In [240]:
# DataFrame slicing, selecting, extracting
employee_df = pd.read_csv('data.csv')

# 1. By columns
# A single column name selects that column as a Series
gender = employee_df['Gender']
print(gender)
# A list of column names selects a DataFrame containing just those columns
name_gender = employee_df[['Name', 'Gender']]
name_gender

0    M
1    M
2    F
3    F
4    F
Name: Gender, dtype: object


Unnamed: 0,Name,Gender
0,Joy,M
1,Tom,M
2,Mary,F
3,Claudia,F
4,Rita,F


In [241]:

# Use the 'Gender' column as the row index so rows can be selected by label.
# Guarded and non-inplace so that re-running this cell is a no-op instead of
# raising KeyError (after the first run 'Gender' is no longer a column).
if 'Gender' in employee_df.columns:
    employee_df = employee_df.set_index('Gender')

In [246]:
# 2. By rows
# Label-based: every row whose index label is 'F'
gender = employee_df.loc['F']
print(gender)
# Position-based: the single row at position 1, returned as a Series
gender = employee_df.iloc[1]
print(gender)
# Position-based slice: rows at positions 2 and 3 (the end position is exclusive)
subset = employee_df.iloc[2:4]
subset

           Name  Age   Address
Gender                        
F          Mary   32  New York
F       Claudia   27    London
F          Rita   25     India
Name            Tom
Age              32
Address    New York
Name: M, dtype: object


Unnamed: 0_level_0,Name,Age,Address
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,Mary,32,New York
F,Claudia,27,London


### Exploratory Data Analysis

In [191]:
## Let's use some real data to do EDA using pandas operations.
# Load the movie data from imdb-movies.csv (path is relative to the working directory).
movies_df = pd.read_csv('imdb-movies.csv')

In [197]:
#1. View data: the first five rows
movies_df.head(n=5)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


In [198]:
#2. Get information about your data:
# column names, non-null counts, dtypes and memory usage.
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rank                1000 non-null   int64  
 1   Title               1000 non-null   object 
 2   Genre               1000 non-null   object 
 3   Description         1000 non-null   object 
 4   Director            1000 non-null   object 
 5   Actors              1000 non-null   object 
 6   Year                1000 non-null   int64  
 7   Runtime (Minutes)   1000 non-null   int64  
 8   Rating              1000 non-null   float64
 9   Votes               1000 non-null   int64  
 10  Revenue (Millions)  872 non-null    float64
 11  Metascore           936 non-null    float64
dtypes: float64(3), int64(4), object(5)
memory usage: 93.9+ KB


In [200]:
#3. DataFrame shape as a (rows, columns) tuple
movies_df.shape

(1000, 12)

In [203]:
#4. Handling duplicates
# Stack the frame on top of itself to manufacture duplicate rows.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
temp_df = pd.concat([movies_df, movies_df])
print("Shape: ", temp_df.shape)
# Reassign instead of using inplace=True so the step reads as a plain transformation.
temp_df = temp_df.drop_duplicates()
temp_df.shape

Shape:  (2000, 12)


(1000, 12)

In [207]:
#5. Handling missing data
# Count the missing values in each column; an all-zero result means nothing is missing.
movies_df.isna().sum()

Rank                    0
Title                   0
Genre                   0
Description             0
Director                0
Actors                  0
Year                    0
Runtime (Minutes)       0
Rating                  0
Votes                   0
Revenue (Millions)    128
Metascore              64
dtype: int64

In [None]:
# Drop every row that has a missing value in any column.
# dropna returns a new DataFrame; movies_df itself is not modified.
movies_df.dropna(axis=0, how='any')

In [None]:
# Instead of removing the rows, we can drop the columns with missing values.
# axis=1 applies dropna to columns; a new DataFrame is returned and movies_df is unchanged.
movies_df.dropna(axis=1)

In [212]:
#6. Imputation
# Pull out the revenue column, which contains missing values (NaN)
revenue = movies_df['Revenue (Millions)']
revenue

0      333.13
1      126.46
2      138.12
3      270.32
4      325.02
        ...  
995       NaN
996     17.54
997     58.01
998       NaN
999     19.64
Name: Revenue (Millions), Length: 1000, dtype: float64

In [217]:
# Impute missing revenue with the column mean (.mean() skips NaN by default).
mean = revenue.mean()
print('Revenue Mean', mean)
# Write the filled column back to the DataFrame explicitly. Calling
# fillna(..., inplace=True) on a Series pulled out of a DataFrame only updates
# movies_df when `revenue` happens to be a view of it; with Copy-on-Write
# (the default from pandas 3.0) the DataFrame would be left unchanged.
revenue = revenue.fillna(mean)
movies_df['Revenue (Millions)'] = revenue
print('New Revenue Mean', revenue.mean())

Revenue Mean 82.95637614678898
New Revenue Mean 82.95637614678898


In [218]:
# Re-check the per-column missing-value counts after the imputation
movies_df.isna().sum()

Rank                   0
Title                  0
Genre                  0
Description            0
Director               0
Actors                 0
Year                   0
Runtime (Minutes)      0
Rating                 0
Votes                  0
Revenue (Millions)     0
Metascore             64
dtype: int64

In [219]:
#7. Understanding your variables
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
movies_df.describe()

Unnamed: 0,Rank,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,936.0
mean,500.5,2012.783,113.172,6.7232,169808.3,82.956376,58.985043
std,288.819436,3.205962,18.810908,0.945429,188762.6,96.412043,17.194757
min,1.0,2006.0,66.0,1.9,61.0,0.0,11.0
25%,250.75,2010.0,100.0,6.2,36309.0,17.4425,47.0
50%,500.5,2014.0,111.0,6.8,110799.0,60.375,59.5
75%,750.25,2016.0,123.0,7.4,239909.8,99.1775,72.0
max,1000.0,2016.0,191.0,9.0,1791916.0,936.63,100.0


In [252]:
# We can also describe a single Series (one column)
movies_df['Rating'].describe()

count    1000.000000
mean        6.723200
std         0.945429
min         1.900000
25%         6.200000
50%         6.800000
75%         7.400000
max         9.000000
Name: Rating, dtype: float64

In [247]:
#8. Apply Functions
def rating_function(x, threshold=8.0):
    """Label a numeric rating as "good" or "bad".

    Args:
        x: The rating to classify.
        threshold: Minimum rating counted as "good" (inclusive). Defaults to
            8.0, the cut-off that was previously hard-coded.

    Returns:
        "good" if x >= threshold, otherwise "bad". A NaN rating compares
        False against any threshold and is therefore labelled "bad".
    """
    return "good" if x >= threshold else "bad"

In [249]:
# Apply rating_function to every value in "Rating" and store the labels in a new column
movies_df["rating_category"] = movies_df["Rating"].apply(rating_function)

In [250]:
# Confirm the new rating_category column is present
movies_df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,rating_category
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,good
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,bad
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,bad
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,bad
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,bad


In [39]:
# Note that pandas DataFrames are accessed primarily by columns. In a sense the row is less important to a DataFrame.
# dataframe[0] looks for a column *labelled* 0, not the column at position 0, so when no
# such column exists it raises KeyError; columns must be selected by their name.
# The error is caught and printed so the demonstration does not abort "Run All".
try:
    dataframe[0][0]
except KeyError as err:
    print(f"{type(err).__name__}: {err}")

KeyError: 0

In [34]:
# To access a value by position instead, use .iloc[row_position, column_position]
dataframe.iloc[0,0]

'Ben Nevis'

In [40]:
# Add a new column to the existing DataFrame (one value per row, in row order)
dataframe['Region'] = ['Grampian'] + ['Cairngorm'] * 4
dataframe

Unnamed: 0,Hill Name,Height,Latitude,Longitude,Region
0,Ben Nevis,1345,56.79685,-5.003508,Grampian
1,Ben Macdui,1309,57.070453,-3.668262,Cairngorm
2,Braeriach,1296,57.078628,-3.728024,Cairngorm
3,Cairn Toul,1291,57.054611,-3.71042,Cairngorm
4,Sgòr an Lochain Uaine,1258,57.057999,-3.725416,Cairngorm


In [80]:
# Creating DataFrame
# From a flat list: one column, one row per element.
dataframe = pd.DataFrame(data=[1, 2, 3, 4, 5])

# From a list of rows with explicit column names (this rebinds `dataframe`).
dataframe = pd.DataFrame(
    data=[
        [1, 2, 3, 4, 5],
        [11, 12, 13, 14, 15],
    ],
    columns=['1st', '2nd', '3rd', '4th', '5th'],
)


Unnamed: 0,one,two
0,1,2
1,11,2
2,111,2


In [257]:
# Build a DataFrame column by column: each keyword becomes a column name.
dataframe = pd.DataFrame(dict(one=[1, 11], two=[2, 22]))

dataframe

Unnamed: 0,one,two
0,1,2
1,11,22


In [259]:
# Selecting one column by name returns a Series
col1 = dataframe['one']
col1

0     1
1    11
Name: one, dtype: int64

In [93]:
print(dataframe.iloc[:,0]) # all rows of the column at position 0 (a column, not a row)
print(dataframe.loc[:,'one']) # the same column, selected by its label

0      1
1     11
2    111
Name: one, dtype: int64
0      1
1     11
2    111
Name: one, dtype: int64
