# Data Analysis Process

1. Asking Questions
2. Data Wrangling<br>
    a. Gathering Data
        - i. CSV files
        - ii. APIs
        - iii. Web Scraping
        - iv. Databases<br>
    b. Assessing Data<br>
    c. Cleaning Data
3. Exploratory Data Analysis
4. Drawing Conclusion
5. Comunicating Results

# This notebook is about Gathering Data

#  -------------------------Import data-------------------------

# Import from csv
##### Documentation: https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html

### 1. Importing pandas

In [2]:
import pandas as pd

### 2. Opening a local csv file

In [3]:
df = pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\aug_train.csv')
df

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
19154,31398,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
19155,24576,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
19156,5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0


### 3. Opening a csv file from an URL

In [4]:
### code snippet ###

import requests
from io import StringIO

url = "https://raw.githubusercontent.com/cs109/2014_data/master/countries.csv"
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.14; rv:66.0) Gecko/20100101 Firefox/66.0"}
req = requests.get(url, headers=headers)
data = StringIO(req.text)

pd.read_csv(data)

Unnamed: 0,Country,Region
0,Algeria,AFRICA
1,Angola,AFRICA
2,Benin,AFRICA
3,Botswana,AFRICA
4,Burkina,AFRICA
...,...,...
189,Paraguay,SOUTH AMERICA
190,Peru,SOUTH AMERICA
191,Suriname,SOUTH AMERICA
192,Uruguay,SOUTH AMERICA


### 4. Sep Parameter

In [5]:
### read tsv file by read_csv using 'sep' parameter

pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\movie_titles_metadata.tsv',sep='\t') # '\t' means tab
## successfully load the tsv file but there are a problem because there are no column name so it consider first items as columns

Unnamed: 0,m0,10 things i hate about you,1999,6.90,62847,['comedy' 'romance']
0,m1,1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
1,m2,15 minutes,2001,6.1,25854.0,['action' 'crime' 'drama' 'thriller']
2,m3,2001: a space odyssey,1968,8.4,163227.0,['adventure' 'mystery' 'sci-fi']
3,m4,48 hrs.,1982,6.9,22289.0,['action' 'comedy' 'crime' 'drama' 'thriller']
4,m5,the fifth element,1997,7.5,133756.0,['action' 'adventure' 'romance' 'sci-fi' 'thri...
...,...,...,...,...,...,...
611,m612,watchmen,2009,7.8,135229.0,['action' 'crime' 'fantasy' 'mystery' 'sci-fi'...
612,m613,xxx,2002,5.6,53505.0,['action' 'adventure' 'crime']
613,m614,x-men,2000,7.4,122149.0,['action' 'sci-fi']
614,m615,young frankenstein,1974,8.0,57618.0,['comedy' 'sci-fi']


In [6]:
### to get rid from this problem, we will add column names to the dataset using 'names' parameter
pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\movie_titles_metadata.tsv',sep='\t',names=['sno','name','release_year','rating','votes','genres'])

Unnamed: 0,sno,name,release_year,rating,votes,genres
0,m0,10 things i hate about you,1999,6.9,62847.0,['comedy' 'romance']
1,m1,1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
2,m2,15 minutes,2001,6.1,25854.0,['action' 'crime' 'drama' 'thriller']
3,m3,2001: a space odyssey,1968,8.4,163227.0,['adventure' 'mystery' 'sci-fi']
4,m4,48 hrs.,1982,6.9,22289.0,['action' 'comedy' 'crime' 'drama' 'thriller']
...,...,...,...,...,...,...
612,m612,watchmen,2009,7.8,135229.0,['action' 'crime' 'fantasy' 'mystery' 'sci-fi'...
613,m613,xxx,2002,5.6,53505.0,['action' 'adventure' 'crime']
614,m614,x-men,2000,7.4,122149.0,['action' 'sci-fi']
615,m615,young frankenstein,1974,8.0,57618.0,['comedy' 'sci-fi']


### 5. Index_col parameter

In [7]:
### to set a column as an index and remove the default index
pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\aug_train.csv',index_col='enrollee_id')

Unnamed: 0_level_0,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
enrollee_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
31398,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
24576,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0


### 6. Header parameter

In [8]:
pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\test.csv')

## there are a problem, to convert the column names from a specific row, we will use 'header' parameter 

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
1,1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0
2,2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0
3,3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1
4,4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0


In [9]:
pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\test.csv',header=1)
## here header=1 means 1st row we want to convert as columns

Unnamed: 0,0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0
1,2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0
2,3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1
3,4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0


### 7. usecols parameter

In [10]:
pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\aug_train.csv').head(3)

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0


In [11]:
### to get specified columns from a dataset
pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\aug_train.csv',usecols=['enrollee_id','gender','education_level'])

Unnamed: 0,enrollee_id,gender,education_level
0,8949,Male,Graduate
1,29725,Male,Graduate
2,11561,,Graduate
3,33241,,Graduate
4,666,Male,Masters
...,...,...,...
19153,7386,Male,Graduate
19154,31398,Male,Graduate
19155,24576,Male,Graduate
19156,5756,Male,High School


### 8. Squeeze function

In [12]:
## to make a column as a series
df= pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\aug_train.csv',usecols=['gender'])
df.squeeze()

0        Male
1        Male
2         NaN
3         NaN
4        Male
         ... 
19153    Male
19154    Male
19155    Male
19156    Male
19157     NaN
Name: gender, Length: 19158, dtype: object

### 9. skiprows/nrows Parameter

In [13]:
pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\aug_train.csv')

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
19154,31398,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
19155,24576,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
19156,5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0


In [14]:
pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\aug_train.csv', skiprows=[0,1]) # skip the 1st and 2nd rows

Unnamed: 0,29725,city_40,0.7759999999999999,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
0,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
1,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
2,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
3,21651,city_176,0.764,,Has relevent experience,Part time course,Graduate,STEM,11,,,1,24,1.0
4,28806,city_160,0.920,Male,Has relevent experience,no_enrollment,High School,,5,50-99,Funded Startup,1,24,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19151,7386,city_173,0.878,Male,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
19152,31398,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
19153,24576,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
19154,5756,city_65,0.802,Male,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0


In [15]:
pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\aug_train.csv',nrows=100) 
## to restrict the number of rows , here we imported only first 100 rows

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.920,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,12081,city_65,0.802,Male,Has relevent experience,Full time course,Graduate,STEM,9,50-99,Pvt Ltd,1,33,0.0
96,7364,city_160,0.920,,No relevent experience,Full time course,High School,,2,100-500,Pvt Ltd,1,142,0.0
97,11184,city_74,0.579,,No relevent experience,Full time course,Graduate,STEM,2,100-500,Pvt Ltd,1,34,0.0
98,7016,city_65,0.802,Male,Has relevent experience,no_enrollment,Graduate,STEM,6,50-99,Pvt Ltd,2,14,1.0


### 10. Encoding parameter

In [16]:
pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\zomato.csv')
## error because the encoding is not UTF-8

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xed in position 7044: invalid continuation byte

In [None]:
## default encoding should be UTF-8
## but when there are another type of encoding in dataset , we use encoding parameter
pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\zomato.csv',encoding='latin-1') # here, the encoding format for the dataset is latin-1

Unnamed: 0,Restaurant ID,Restaurant Name,Country Code,City,Address,Locality,Locality Verbose,Longitude,Latitude,Cuisines,...,Currency,Has Table booking,Has Online delivery,Is delivering now,Switch to order menu,Price range,Aggregate rating,Rating color,Rating text,Votes
0,6317637,Le Petit Souffle,162,Makati City,"Third Floor, Century City Mall, Kalayaan Avenu...","Century City Mall, Poblacion, Makati City","Century City Mall, Poblacion, Makati City, Mak...",121.027535,14.565443,"French, Japanese, Desserts",...,Botswana Pula(P),Yes,No,No,No,3,4.8,Dark Green,Excellent,314
1,6304287,Izakaya Kikufuji,162,Makati City,"Little Tokyo, 2277 Chino Roces Avenue, Legaspi...","Little Tokyo, Legaspi Village, Makati City","Little Tokyo, Legaspi Village, Makati City, Ma...",121.014101,14.553708,Japanese,...,Botswana Pula(P),Yes,No,No,No,3,4.5,Dark Green,Excellent,591
2,6300002,Heat - Edsa Shangri-La,162,Mandaluyong City,"Edsa Shangri-La, 1 Garden Way, Ortigas, Mandal...","Edsa Shangri-La, Ortigas, Mandaluyong City","Edsa Shangri-La, Ortigas, Mandaluyong City, Ma...",121.056831,14.581404,"Seafood, Asian, Filipino, Indian",...,Botswana Pula(P),Yes,No,No,No,4,4.4,Green,Very Good,270
3,6318506,Ooma,162,Mandaluyong City,"Third Floor, Mega Fashion Hall, SM Megamall, O...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.056475,14.585318,"Japanese, Sushi",...,Botswana Pula(P),No,No,No,No,4,4.9,Dark Green,Excellent,365
4,6314302,Sambo Kojin,162,Mandaluyong City,"Third Floor, Mega Atrium, SM Megamall, Ortigas...","SM Megamall, Ortigas, Mandaluyong City","SM Megamall, Ortigas, Mandaluyong City, Mandal...",121.057508,14.584450,"Japanese, Korean",...,Botswana Pula(P),Yes,No,No,No,4,4.8,Dark Green,Excellent,229
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9546,5915730,NamlÛ± Gurme,208,ÛÁstanbul,"Kemankeô Karamustafa Paôa Mahallesi, RÛ±htÛ±...",Karakí_y,"Karakí_y, ÛÁstanbul",28.977392,41.022793,Turkish,...,Turkish Lira(TL),No,No,No,No,3,4.1,Green,Very Good,788
9547,5908749,Ceviz AÛôacÛ±,208,ÛÁstanbul,"Koôuyolu Mahallesi, Muhittin íìstí_ndaÛô Cadd...",Koôuyolu,"Koôuyolu, ÛÁstanbul",29.041297,41.009847,"World Cuisine, Patisserie, Cafe",...,Turkish Lira(TL),No,No,No,No,3,4.2,Green,Very Good,1034
9548,5915807,Huqqa,208,ÛÁstanbul,"Kuruí_eôme Mahallesi, Muallim Naci Caddesi, N...",Kuruí_eôme,"Kuruí_eôme, ÛÁstanbul",29.034640,41.055817,"Italian, World Cuisine",...,Turkish Lira(TL),No,No,No,No,4,3.7,Yellow,Good,661
9549,5916112,Aôôk Kahve,208,ÛÁstanbul,"Kuruí_eôme Mahallesi, Muallim Naci Caddesi, N...",Kuruí_eôme,"Kuruí_eôme, ÛÁstanbul",29.036019,41.057979,Restaurant Cafe,...,Turkish Lira(TL),No,No,No,No,4,4.0,Green,Very Good,901


### 11. Skip bad lines
#### on_bad_lines= 'skip'

In [None]:
## suppose in a dataset, there are 8 columns in general but one of the row has extra 1 column. so it will show error
## to get rid from the error and skip the extra column we use the parameter

pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\test.csv', on_bad_lines='skip')

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14
0,0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
1,1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0
2,2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0
3,3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1
4,4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0


### 12. dtypes parameter

In [None]:
pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\aug_train.csv').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  19158 non-null  float64
 3   gender                  14650 non-null  object 
 4   relevent_experience     19158 non-null  object 
 5   enrolled_university     18772 non-null  object 
 6   education_level         18698 non-null  object 
 7   major_discipline        16345 non-null  object 
 8   experience              19093 non-null  object 
 9   company_size            13220 non-null  object 
 10  company_type            13018 non-null  object 
 11  last_new_job            18735 non-null  object 
 12  training_hours          19158 non-null  int64  
 13  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

In [None]:
### to change the datatype of a column
pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\aug_train.csv',dtype={'target':int}).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  19158 non-null  float64
 3   gender                  14650 non-null  object 
 4   relevent_experience     19158 non-null  object 
 5   enrolled_university     18772 non-null  object 
 6   education_level         18698 non-null  object 
 7   major_discipline        16345 non-null  object 
 8   experience              19093 non-null  object 
 9   company_size            13220 non-null  object 
 10  company_type            13018 non-null  object 
 11  last_new_job            18735 non-null  object 
 12  training_hours          19158 non-null  int64  
 13  target                  19158 non-null  int64  
dtypes: float64(1), int64(3), object(10)
me

### 13. Handling Dates
#### parse_dates

In [None]:
pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\IPL Matches 2008-2020.csv').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   id               816 non-null    int64  
 1   city             803 non-null    object 
 2   date             816 non-null    object 
 3   player_of_match  812 non-null    object 
 4   venue            816 non-null    object 
 5   neutral_venue    816 non-null    int64  
 6   team1            816 non-null    object 
 7   team2            816 non-null    object 
 8   toss_winner      816 non-null    object 
 9   toss_decision    816 non-null    object 
 10  winner           812 non-null    object 
 11  result           812 non-null    object 
 12  result_margin    799 non-null    float64
 13  eliminator       812 non-null    object 
 14  method           19 non-null     object 
 15  umpire1          816 non-null    object 
 16  umpire2          816 non-null    object 
dtypes: float64(1), i

In [None]:
## to chage string to date-time format
pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\IPL Matches 2008-2020.csv',parse_dates=['date']).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816 entries, 0 to 815
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   id               816 non-null    int64         
 1   city             803 non-null    object        
 2   date             816 non-null    datetime64[ns]
 3   player_of_match  812 non-null    object        
 4   venue            816 non-null    object        
 5   neutral_venue    816 non-null    int64         
 6   team1            816 non-null    object        
 7   team2            816 non-null    object        
 8   toss_winner      816 non-null    object        
 9   toss_decision    816 non-null    object        
 10  winner           812 non-null    object        
 11  result           812 non-null    object        
 12  result_margin    799 non-null    float64       
 13  eliminator       812 non-null    object        
 14  method           19 non-null     object   

### 14. Converters

In [None]:
def rename(name):
    if name == "Royal Challengers Bangalore":
        return "RCB"
    else:
        return name

In [None]:
rename("Royal Challengers Bangalore")

'RCB'

In [None]:
## applying a function on a specific column
pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\IPL Matches 2008-2020.csv',converters={'team1':rename})

Unnamed: 0,id,city,date,player_of_match,venue,neutral_venue,team1,team2,toss_winner,toss_decision,winner,result,result_margin,eliminator,method,umpire1,umpire2
0,335982,Bangalore,2008-04-18,BB McCullum,M Chinnaswamy Stadium,0,RCB,Kolkata Knight Riders,Royal Challengers Bangalore,field,Kolkata Knight Riders,runs,140.0,N,,Asad Rauf,RE Koertzen
1,335983,Chandigarh,2008-04-19,MEK Hussey,"Punjab Cricket Association Stadium, Mohali",0,Kings XI Punjab,Chennai Super Kings,Chennai Super Kings,bat,Chennai Super Kings,runs,33.0,N,,MR Benson,SL Shastri
2,335984,Delhi,2008-04-19,MF Maharoof,Feroz Shah Kotla,0,Delhi Daredevils,Rajasthan Royals,Rajasthan Royals,bat,Delhi Daredevils,wickets,9.0,N,,Aleem Dar,GA Pratapkumar
3,335985,Mumbai,2008-04-20,MV Boucher,Wankhede Stadium,0,Mumbai Indians,Royal Challengers Bangalore,Mumbai Indians,bat,Royal Challengers Bangalore,wickets,5.0,N,,SJ Davis,DJ Harper
4,335986,Kolkata,2008-04-20,DJ Hussey,Eden Gardens,0,Kolkata Knight Riders,Deccan Chargers,Deccan Chargers,bat,Kolkata Knight Riders,wickets,5.0,N,,BF Bowden,K Hariharan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
811,1216547,Dubai,2020-09-28,AB de Villiers,Dubai International Cricket Stadium,0,RCB,Mumbai Indians,Mumbai Indians,field,Royal Challengers Bangalore,tie,,Y,,Nitin Menon,PR Reiffel
812,1237177,Dubai,2020-11-05,JJ Bumrah,Dubai International Cricket Stadium,0,Mumbai Indians,Delhi Capitals,Delhi Capitals,field,Mumbai Indians,runs,57.0,N,,CB Gaffaney,Nitin Menon
813,1237178,Abu Dhabi,2020-11-06,KS Williamson,Sheikh Zayed Stadium,0,RCB,Sunrisers Hyderabad,Sunrisers Hyderabad,field,Sunrisers Hyderabad,wickets,6.0,N,,PR Reiffel,S Ravi
814,1237180,Abu Dhabi,2020-11-08,MP Stoinis,Sheikh Zayed Stadium,0,Delhi Capitals,Sunrisers Hyderabad,Delhi Capitals,bat,Delhi Capitals,runs,17.0,N,,PR Reiffel,S Ravi


### 15. na_values parameter

In [None]:
### to specify which value we consider as nan value

pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\aug_train.csv',na_values=['Male']) # here we consider Male as nan value

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.920,,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19153,7386,city_173,0.878,,No relevent experience,no_enrollment,Graduate,Humanities,14,,,1,42,1.0
19154,31398,city_103,0.920,,Has relevent experience,no_enrollment,Graduate,STEM,14,,,4,52,1.0
19155,24576,city_103,0.920,,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,4,44,0.0
19156,5756,city_65,0.802,,Has relevent experience,no_enrollment,High School,,<1,500-999,Pvt Ltd,2,97,0.0


### 16. Loading a huge dataset in chunks

In [None]:
dfs = pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\aug_train.csv',chunksize=5000) # here we created multiple chunks each contains maximum 5000 rows 
dfs

<pandas.io.parsers.readers.TextFileReader at 0x1f8be9eac10>

In [None]:
for chunks in dfs:
    print(chunks.shape)

(5000, 14)
(5000, 14)
(5000, 14)
(4158, 14)


# Import from excel - (xlsx)
##### Documentation: https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html
##### Parameters are quit similar to read_csv

In [None]:
pd.read_excel(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\outputl.xlsx')
## it automatically read first sheet

Unnamed: 0.1,Unnamed: 0,batter,runs
0,0,A Ashish Reddy,280
1,1,A Badoni,161
2,2,A Chandila,4
3,3,A Chopra,53
4,4,A Choudhary,25
...,...,...,...
600,600,Yash Dayal,0
601,601,Yashpal Singh,47
602,602,Younis Khan,3
603,603,Yuvraj Singh,2754


In [None]:
pd.read_excel(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\outputl.xlsx', index_col='Unnamed: 0') # set that column as index

Unnamed: 0,batter,runs
0,A Ashish Reddy,280
1,A Badoni,161
2,A Chandila,4
3,A Chopra,53
4,A Choudhary,25
...,...,...
600,Yash Dayal,0
601,Yashpal Singh,47
602,Younis Khan,3
603,Yuvraj Singh,2754


In [None]:
## to read second sheet
pd.read_excel(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\outputl.xlsx', sheet_name= 'Sheet2')

Unnamed: 0,id,season,city,date,team1,team2,toss_winner,toss_decision,result,dl_applied,winner,win_by_runs,win_by_wickets,player_of_match,venue,umpire1,umpire2,umpire3
0,1,2017,Hyderabad,2017-04-05,Sunrisers Hyderabad,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Sunrisers Hyderabad,35,0,Yuvraj Singh,"Rajiv Gandhi International Stadium, Uppal",AY Dandekar,NJ Llong,
1,2,2017,Pune,2017-04-06,Mumbai Indians,Rising Pune Supergiant,Rising Pune Supergiant,field,normal,0,Rising Pune Supergiant,0,7,SPD Smith,Maharashtra Cricket Association Stadium,A Nand Kishore,S Ravi,
2,3,2017,Rajkot,2017-04-07,Gujarat Lions,Kolkata Knight Riders,Kolkata Knight Riders,field,normal,0,Kolkata Knight Riders,0,10,CA Lynn,Saurashtra Cricket Association Stadium,Nitin Menon,CK Nandan,
3,4,2017,Indore,2017-04-08,Rising Pune Supergiant,Kings XI Punjab,Kings XI Punjab,field,normal,0,Kings XI Punjab,0,6,GJ Maxwell,Holkar Cricket Stadium,AK Chaudhary,C Shamshuddin,
4,5,2017,Bangalore,2017-04-08,Royal Challengers Bangalore,Delhi Daredevils,Royal Challengers Bangalore,bat,normal,0,Royal Challengers Bangalore,15,0,KM Jadhav,M Chinnaswamy Stadium,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
631,632,2016,Raipur,2016-05-22,Delhi Daredevils,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Royal Challengers Bangalore,0,6,V Kohli,Shaheed Veer Narayan Singh International Stadium,A Nand Kishore,BNJ Oxenford,
632,633,2016,Bangalore,2016-05-24,Gujarat Lions,Royal Challengers Bangalore,Royal Challengers Bangalore,field,normal,0,Royal Challengers Bangalore,0,4,AB de Villiers,M Chinnaswamy Stadium,AK Chaudhary,HDPK Dharmasena,
633,634,2016,Delhi,2016-05-25,Sunrisers Hyderabad,Kolkata Knight Riders,Kolkata Knight Riders,field,normal,0,Sunrisers Hyderabad,22,0,MC Henriques,Feroz Shah Kotla,M Erasmus,C Shamshuddin,
634,635,2016,Delhi,2016-05-27,Gujarat Lions,Sunrisers Hyderabad,Sunrisers Hyderabad,field,normal,0,Sunrisers Hyderabad,0,4,DA Warner,Feroz Shah Kotla,M Erasmus,CK Nandan,


# Import from tsv
#### Here, we use read_csv and change the 'sep=' parameter

In [None]:
### read tsv file by read_csv using 'sep' parameter

pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\movie_titles_metadata.tsv',sep='\t') # '\t' means tab
## successfully load the tsv file but there are a problem because there are no column name so it consider first items as columns

Unnamed: 0,m0,10 things i hate about you,1999,6.90,62847,['comedy' 'romance']
0,m1,1492: conquest of paradise,1992,6.2,10421.0,['adventure' 'biography' 'drama' 'history']
1,m2,15 minutes,2001,6.1,25854.0,['action' 'crime' 'drama' 'thriller']
2,m3,2001: a space odyssey,1968,8.4,163227.0,['adventure' 'mystery' 'sci-fi']
3,m4,48 hrs.,1982,6.9,22289.0,['action' 'comedy' 'crime' 'drama' 'thriller']
4,m5,the fifth element,1997,7.5,133756.0,['action' 'adventure' 'romance' 'sci-fi' 'thri...
...,...,...,...,...,...,...
611,m612,watchmen,2009,7.8,135229.0,['action' 'crime' 'fantasy' 'mystery' 'sci-fi'...
612,m613,xxx,2002,5.6,53505.0,['action' 'adventure' 'crime']
613,m614,x-men,2000,7.4,122149.0,['action' 'sci-fi']
614,m615,young frankenstein,1974,8.0,57618.0,['comedy' 'sci-fi']


# Import from txt
#### Here, we use read_csv and change the 'sep=' parameter

In [None]:
### read txt file by read_csv using 'sep' parameter

pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\sale.txt',sep='\t') # '\t' means tab, 
## here, we use '\t' because in this txt file, the values are tab separated

Unnamed: 0,Customer_ID,Name,Age,Country,Product,Purchase_Amount
0,101,John Doe,25,USA,Laptop,1200
1,102,Jane Smith,30,Canada,Smartphone,800
2,103,Ali Khan,28,UAE,Headphones,150
3,104,Sophia Lee,35,UK,Tablet,400
4,105,Maria Gonzalez,22,Mexico,Smartwatch,250
5,106,David Brown,40,Australia,Monitor,300
6,107,Li Wei,29,China,Keyboard,100
7,108,Aisha Yusuf,31,Nigeria,Mouse,50
8,109,Hans Müller,33,Germany,Printer,200
9,110,Emma Dubois,27,France,Charger,30


# Import from json
##### Documentation: https://pandas.pydata.org/docs/reference/api/pandas.read_json.html

In [None]:
pd.read_json(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\employee.json')

Unnamed: 0,Employee_ID,Name,Age,Department,Salary,Remote_Work
0,201,Alice Johnson,29,Finance,55000,True
1,202,Bob Smith,35,Engineering,78000,False
2,203,Charlie Evans,41,Marketing,62000,True
3,204,Diana Lee,27,Human Resources,50000,False
4,205,Ethan Brown,32,IT Support,58000,True


#### Import json data from url

In [None]:
pd.read_json('https://api.exchangerate-api.com/v4/latest/INR')

Unnamed: 0,provider,WARNING_UPGRADE_TO_V6,terms,base,date,time_last_updated,rates
INR,https://www.exchangerate-api.com,https://www.exchangerate-api.com/docs/free,https://www.exchangerate-api.com/terms,INR,2025-04-01,1743465601,1.000
AED,https://www.exchangerate-api.com,https://www.exchangerate-api.com/docs/free,https://www.exchangerate-api.com/terms,INR,2025-04-01,1743465601,0.043
AFN,https://www.exchangerate-api.com,https://www.exchangerate-api.com/docs/free,https://www.exchangerate-api.com/terms,INR,2025-04-01,1743465601,0.826
ALL,https://www.exchangerate-api.com,https://www.exchangerate-api.com/docs/free,https://www.exchangerate-api.com/terms,INR,2025-04-01,1743465601,1.070
AMD,https://www.exchangerate-api.com,https://www.exchangerate-api.com/docs/free,https://www.exchangerate-api.com/terms,INR,2025-04-01,1743465601,4.580
...,...,...,...,...,...,...,...
XPF,https://www.exchangerate-api.com,https://www.exchangerate-api.com/docs/free,https://www.exchangerate-api.com/terms,INR,2025-04-01,1743465601,1.290
YER,https://www.exchangerate-api.com,https://www.exchangerate-api.com/docs/free,https://www.exchangerate-api.com/terms,INR,2025-04-01,1743465601,2.880
ZAR,https://www.exchangerate-api.com,https://www.exchangerate-api.com/docs/free,https://www.exchangerate-api.com/terms,INR,2025-04-01,1743465601,0.215
ZMW,https://www.exchangerate-api.com,https://www.exchangerate-api.com/docs/free,https://www.exchangerate-api.com/terms,INR,2025-04-01,1743465601,0.330


# Import from SQL
##### Documentation: https://pandas.pydata.org/docs/reference/api/pandas.read_sql_query.html

In [None]:
import mysql.connector #used to connect to and interact with MySQL databases

In [None]:
conn = mysql.connector.connect(host='localhost',user='root',password='',database='world') # 4 things

In [None]:
pd.read_sql_query('SELECT * FROM city', conn) # 2 things

  pd.read_sql_query('SELECT * FROM city', conn) # 2 things


Unnamed: 0,ID,Name,CountryCode,District,Population
0,1,Kabul,AFG,Kabol,1780000
1,2,Qandahar,AFG,Qandahar,237500
2,3,Herat,AFG,Herat,186800
3,4,Mazar-e-Sharif,AFG,Balkh,127800
4,5,Amsterdam,NLD,Noord-Holland,731200
...,...,...,...,...,...
4074,4075,Khan Yunis,PSE,Khan Yunis,123175
4075,4076,Hebron,PSE,Hebron,119401
4076,4077,Jabaliya,PSE,North Gaza,113901
4077,4078,Nablus,PSE,Nablus,100231


In [None]:
pd.read_sql_query("SELECT * FROM city WHERE CountryCode LIKE 'BGD' ", conn)

  pd.read_sql_query("SELECT * FROM city WHERE CountryCode LIKE 'BGD' ", conn)


Unnamed: 0,ID,Name,CountryCode,District,Population
0,150,Dhaka,BGD,Dhaka,3612850
1,151,Chittagong,BGD,Chittagong,1392860
2,152,Khulna,BGD,Khulna,663340
3,153,Rajshahi,BGD,Rajshahi,294056
4,154,Narayanganj,BGD,Dhaka,202134
5,155,Rangpur,BGD,Rajshahi,191398
6,156,Mymensingh,BGD,Dhaka,188713
7,157,Barisal,BGD,Barisal,170232
8,158,Tungi,BGD,Dhaka,168702
9,159,Jessore,BGD,Khulna,139710


In [None]:
pd.read_sql_query("SELECT * FROM country WHERE LifeExpectancy > 80 ", conn) # we can use filtering

  pd.read_sql_query("SELECT * FROM country WHERE LifeExpectancy > 80 ", conn) # we can use filtering


Unnamed: 0,Code,Name,Continent,Region,SurfaceArea,IndepYear,Population,LifeExpectancy,GNP,GNPOld,LocalName,GovernmentForm,HeadOfState,Capital,Code2
0,AND,Andorra,Europe,Southern Europe,468.0,1278.0,78000,83.5,1630.0,,Andorra,Parliamentary Coprincipality,,55,AD
1,JPN,Japan,Asia,Eastern Asia,377829.0,-660.0,126714000,80.7,3787042.0,4192638.0,Nihon/Nippon,Constitutional Monarchy,Akihito,1532,JP
2,MAC,Macao,Asia,Eastern Asia,18.0,,473000,81.6,5749.0,5940.0,Macau/Aomen,Special Administrative Region of China,Jiang Zemin,2454,MO
3,SGP,Singapore,Asia,Southeast Asia,618.0,1965.0,3567000,80.1,86503.0,96318.0,Singapore/Singapura/Xinjiapo/Singapur,Republic,Sellapan Rama Nathan,3208,SG
4,SMR,San Marino,Europe,Southern Europe,61.0,885.0,27000,81.1,510.0,,San Marino,Republic,,3171,SM


In [None]:
df = pd.read_sql_query("SELECT * FROM countrylanguage", conn)
df

  df = pd.read_sql_query("SELECT * FROM countrylanguage", conn)


Unnamed: 0,CountryCode,Language,IsOfficial,Percentage
0,ABW,Dutch,T,5.3
1,ABW,English,F,9.5
2,ABW,Papiamento,F,76.7
3,ABW,Spanish,F,7.4
4,AFG,Balochi,F,0.9
...,...,...,...,...
979,ZMB,Tongan,F,11.0
980,ZWE,English,T,2.2
981,ZWE,Ndebele,F,16.2
982,ZWE,Nyanja,F,2.2


# ---------------Export Data------------------
- to csv
- to excel
- to html
- to json
- to sql

In [None]:
df = pd.read_csv(r'E:\Learn_Data_Science\Data_Analysis_Process\Datasets_for_Data_Analysis_Process\deliveries.csv')
df.head(3)

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,...,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder
0,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,1,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,,,
1,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,2,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,0,0,0,,,
2,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,1,3,DA Warner,S Dhawan,TS Mills,0,...,0,0,0,0,4,0,4,,,


### to csv

In [None]:
### Find the batsman names and their total runs in the entire ipl, then export as csv file 

temp_df = df.groupby('batsman')['batsman_runs'].sum().reset_index()
temp_df

Unnamed: 0,batsman,batsman_runs
0,A Ashish Reddy,280
1,A Chandila,4
2,A Chopra,53
3,A Choudhary,25
4,A Dananjaya,4
...,...,...
511,YV Takawale,192
512,Yashpal Singh,47
513,Younis Khan,3
514,Yuvraj Singh,2765


In [None]:
temp_df.to_csv('batsman_total_runs.csv')

In [None]:
temp_df.to_csv('batsman_total_runs-NI.csv', index=False) # without index

### to excel

In [None]:
temp_df.to_excel('batsman_total_runs.xlsx')

In [None]:
temp_df.to_excel('batsman_total_runs.xlsx',sheet_name='batsman') # to save in specific sheet

In [None]:
temp_df2 = df.pivot_table(index='batsman',columns='bowling_team', values='batsman_runs', aggfunc='sum')
temp_df2

bowling_team,Chennai Super Kings,Deccan Chargers,Delhi Capitals,Delhi Daredevils,Gujarat Lions,Kings XI Punjab,Kochi Tuskers Kerala,Kolkata Knight Riders,Mumbai Indians,Pune Warriors,Rajasthan Royals,Rising Pune Supergiant,Rising Pune Supergiants,Royal Challengers Bangalore,Sunrisers Hyderabad
batsman,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
A Ashish Reddy,45.0,,,36.0,,37.0,,17.0,27.0,26.0,37.0,,,55.0,
A Chandila,,0.0,,,,,,,,,,,,4.0,
A Chopra,,35.0,,13.0,,2.0,,,1.0,,,,,2.0,
A Choudhary,,,,,15.0,4.0,,,,,,,,,6.0
A Dananjaya,,,,4.0,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
YV Takawale,19.0,5.0,,14.0,,0.0,,85.0,,,69.0,,,,
Yashpal Singh,,,,13.0,,,,,8.0,,26.0,,,,
Younis Khan,,,,,,3.0,,,,,,,,,
Yuvraj Singh,345.0,118.0,59.0,459.0,13.0,260.0,8.0,386.0,275.0,,384.0,47.0,23.0,340.0,48.0


In [None]:
# to write in existing excel file with multiple sheets 
with pd.ExcelWriter('batsman_total_runs.xlsx') as writer:
    temp_df.to_excel(writer, sheet_name= 'batsman&run')
    temp_df2.to_excel(writer, sheet_name= 'batsmanrun_vs_other_team')

### to html

In [None]:
## to html table or blog

In [None]:
df.query('batsman_runs == 6') # to filter out the rows where batsman_runs = 6

Unnamed: 0,match_id,inning,batting_team,bowling_team,over,ball,batsman,non_striker,bowler,is_super_over,...,bye_runs,legbye_runs,noball_runs,penalty_runs,batsman_runs,extra_runs,total_runs,player_dismissed,dismissal_kind,fielder
10,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,2,4,DA Warner,S Dhawan,A Choudhary,0,...,0,0,0,0,6,0,6,,,
47,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,8,4,MC Henriques,S Dhawan,TM Head,0,...,0,0,0,0,6,0,6,,,
75,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,13,2,Yuvraj Singh,MC Henriques,A Choudhary,0,...,0,0,0,0,6,0,6,,,
89,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,15,3,Yuvraj Singh,MC Henriques,S Aravind,0,...,0,0,0,0,6,0,6,,,
91,1,1,Sunrisers Hyderabad,Royal Challengers Bangalore,15,5,MC Henriques,Yuvraj Singh,S Aravind,0,...,0,0,0,0,6,0,6,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178987,11415,2,Chennai Super Kings,Mumbai Indians,6,4,SR Watson,SK Raina,SL Malinga,0,...,0,0,0,0,6,0,6,,,
179048,11415,2,Chennai Super Kings,Mumbai Indians,16,1,DJ Bravo,SR Watson,SL Malinga,0,...,0,0,0,0,6,0,6,,,
179061,11415,2,Chennai Super Kings,Mumbai Indians,18,2,SR Watson,DJ Bravo,KH Pandya,0,...,0,0,0,0,6,0,6,,,
179062,11415,2,Chennai Super Kings,Mumbai Indians,18,3,SR Watson,DJ Bravo,KH Pandya,0,...,0,0,0,0,6,0,6,,,


In [None]:
hdf=df.query('batsman_runs == 6').pivot_table(index='over', columns='ball', values='batsman_runs', aggfunc= 'count')
hdf

ball,1,2,3,4,5,6,7,8,9
over,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,7.0,12.0,27.0,31.0,24.0,20.0,12.0,1.0,
2,26.0,30.0,35.0,43.0,45.0,42.0,10.0,5.0,
3,63.0,46.0,57.0,52.0,48.0,58.0,8.0,2.0,1.0
4,49.0,61.0,51.0,81.0,54.0,53.0,11.0,1.0,
5,54.0,56.0,82.0,64.0,62.0,60.0,10.0,2.0,
6,61.0,82.0,44.0,65.0,56.0,66.0,11.0,1.0,
7,27.0,45.0,34.0,44.0,51.0,28.0,3.0,3.0,
8,44.0,47.0,55.0,49.0,53.0,39.0,7.0,,
9,70.0,56.0,56.0,58.0,52.0,35.0,11.0,,1.0
10,43.0,38.0,60.0,43.0,52.0,54.0,9.0,1.0,


In [None]:
hdf.to_html('sixes_hitmap.html')

### to json

In [None]:
df.groupby(['batting_team','batsman'])['batsman_runs'].sum().unstack()

batsman,A Ashish Reddy,A Chandila,A Chopra,A Choudhary,A Dananjaya,A Flintoff,A Hales,A Joseph,A Kumble,A Mishra,...,Y Nagar,Y Venugopal Rao,YA Abdulla,YK Pathan,YS Chahal,YV Takawale,Yashpal Singh,Younis Khan,Yuvraj Singh,Z Khan
batting_team,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Chennai Super Kings,,,,,,62.0,,,,,...,,,,,,,,,,
Deccan Chargers,35.0,,,,,,,,,84.0,...,,446.0,,,,,,,,
Delhi Capitals,,,,,,,,,,27.0,...,,,,,,,,,,
Delhi Daredevils,,,,,,,,,,164.0,...,285.0,468.0,,,,,,,248.0,10.0
Gujarat Lions,,,,,,,,,,,...,,,,,,,,,,
Kings XI Punjab,,,,,,,,,,,...,,,0.0,,,,,,964.0,
Kochi Tuskers Kerala,,,,,,,,,,,...,,,,,,,,,,
Kolkata Knight Riders,,,53.0,,,,,,,,...,,,,1893.0,,,47.0,,,
Mumbai Indians,,,,,4.0,,,15.0,,,...,,,,,,88.0,,,108.0,40.0
Pune Warriors,,,,,,,,,,,...,,,,,,,,,581.0,


In [None]:
df.groupby(['batting_team','batsman'])['batsman_runs'].sum().unstack().to_json('ipl.json')

### to sql

In [None]:
import pymysql # library for mysql
from sqlalchemy import create_engine # universal SQL toolkit and ORM

In [None]:
engin = create_engine("mysql+pymysql://root:@localhost/ipl") #{user=root}:{password= }@{url=localhost}/{database=ipl}
df.to_sql('ipl_deliveries', con= engin, if_exists= 'append') #

179078

In [None]:
temp_df.to_sql('ipl_batsman_runs', con= engin, if_exists= 'append') #export another one

516

# -----Import data from website using API-----

In [None]:
import requests # to extract data from an url

In [None]:
response= requests.get('https://api.themoviedb.org/3/movie/top_rated?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US&page=1')

In [None]:
response.json()

{'page': 1,
 'results': [{'adult': False,
   'backdrop_path': '/zfbjgQE1uSd9wiPTX4VzsLi0rGG.jpg',
   'genre_ids': [18, 80],
   'id': 278,
   'original_language': 'en',
   'original_title': 'The Shawshank Redemption',
   'overview': 'Imprisoned in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.',
   'popularity': 41.2793,
   'poster_path': '/9cqNxx0GxF0bflZmeSMuL5tnGzr.jpg',
   'release_date': '1994-09-23',
   'title': 'The Shawshank Redemption',
   'video': False,
   'vote_average': 8.7,
   'vote_count': 28027},
  {'adult': False,
   'backdrop_path': '/tmU7GeKVybMWFButWEGl2M4GeiP.jpg',
   'genre_ids': [18, 80],
   'id': 238,
   'original_language': 'en',
   'or

In [None]:
response.json()['results']

[{'adult': False,
  'backdrop_path': '/zfbjgQE1uSd9wiPTX4VzsLi0rGG.jpg',
  'genre_ids': [18, 80],
  'id': 278,
  'original_language': 'en',
  'original_title': 'The Shawshank Redemption',
  'overview': 'Imprisoned in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.',
  'popularity': 41.2793,
  'poster_path': '/9cqNxx0GxF0bflZmeSMuL5tnGzr.jpg',
  'release_date': '1994-09-23',
  'title': 'The Shawshank Redemption',
  'video': False,
  'vote_average': 8.7,
  'vote_count': 28027},
 {'adult': False,
  'backdrop_path': '/tmU7GeKVybMWFButWEGl2M4GeiP.jpg',
  'genre_ids': [18, 80],
  'id': 238,
  'original_language': 'en',
  'original_title': 'The Godfather',
  'overview

In [None]:
df = pd.DataFrame(response.json()['results'])
df

Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/zfbjgQE1uSd9wiPTX4VzsLi0rGG.jpg,"[18, 80]",278,en,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,41.2793,/9cqNxx0GxF0bflZmeSMuL5tnGzr.jpg,1994-09-23,The Shawshank Redemption,False,8.7,28027
1,False,/tmU7GeKVybMWFButWEGl2M4GeiP.jpg,"[18, 80]",238,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",43.2585,/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,1972-03-14,The Godfather,False,8.687,21244
2,False,/kGzFbGhp99zva6oZODW5atUtnqi.jpg,"[18, 80]",240,en,The Godfather Part II,In the continuing saga of the Corleone crime f...,17.6014,/hek3koDUyRQk7FIhPXsa6mT2Zc3.jpg,1974-12-20,The Godfather Part II,False,8.569,12832
3,False,/zb6fM1CX41D9rF9hdgclu0peUmy.jpg,"[18, 36, 10752]",424,en,Schindler's List,The true story of how businessman Oskar Schind...,23.3275,/sF1U4EUQS8YHUYjNl3pMGNIQyr0.jpg,1993-12-15,Schindler's List,False,8.565,16300
4,False,/bxgTSUenZDHNFerQ1whRKplrMKF.jpg,[18],389,en,12 Angry Men,The defense and the prosecution have rested an...,12.3068,/ow3wq89wM8qd5X7hWKxiRfsFf9C.jpg,1957-04-10,12 Angry Men,False,8.549,9025
5,False,/6oaL4DP75yABrd5EbC4H2zq5ghc.jpg,"[16, 10751, 14]",129,ja,千と千尋の神隠し,"A young girl, Chihiro, becomes trapped in a st...",30.1209,/39wmItIWsg5sZMyRUHLkWBcuVCM.jpg,2001-07-20,Spirited Away,False,8.538,16972
6,False,/90ez6ArvpO8bvpyIngBuwXOqJm5.jpg,"[35, 18, 10749]",19404,hi,दिलवाले दुल्हनिया ले जायेंगे,"Raj is a rich, carefree, happy-go-lucky second...",5.891,/lfRkUr7DYdHldAqi3PwdQGBRBPM.jpg,1995-10-20,Dilwale Dulhania Le Jayenge,False,8.519,4473
7,False,/oOv2oUXcAaNXakRqUPxYq5lJURz.jpg,"[18, 28, 80, 53]",155,en,The Dark Knight,Batman raises the stakes in his war on crime. ...,30.945,/qJ2tW6WMUDux911r6m7haRef0WH.jpg,2008-07-16,The Dark Knight,False,8.519,33638
8,False,/vxJ08SvwomfKbpboCWynC3uqUg4.jpg,"[14, 18, 80]",497,en,The Green Mile,A supernatural tale set on death row in a Sout...,17.3965,/8VG8fDNiy50H4FedGwdSVUPoaJe.jpg,1999-12-10,The Green Mile,False,8.504,17965
9,False,/hiKmpZMGZsrkA3cdce8a7Dpos1j.jpg,"[35, 53, 18]",496243,ko,기생충,"All unemployed, Ki-taek's family takes peculia...",23.9818,/7IiTTgloJzvGI1TAYymCfbfl3vT.jpg,2019-05-30,Parasite,False,8.501,18875


In [None]:
df.shape

(20, 14)

In [None]:
temp_df = df[['id','title','overview','release_date','popularity','vote_average','vote_count']]
temp_df

Unnamed: 0,id,title,overview,release_date,popularity,vote_average,vote_count
0,278,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,1994-09-23,41.2793,8.7,28027
1,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",1972-03-14,43.2585,8.687,21244
2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,1974-12-20,17.6014,8.569,12832
3,424,Schindler's List,The true story of how businessman Oskar Schind...,1993-12-15,23.3275,8.565,16300
4,389,12 Angry Men,The defense and the prosecution have rested an...,1957-04-10,12.3068,8.549,9025
5,129,Spirited Away,"A young girl, Chihiro, becomes trapped in a st...",2001-07-20,30.1209,8.538,16972
6,19404,Dilwale Dulhania Le Jayenge,"Raj is a rich, carefree, happy-go-lucky second...",1995-10-20,5.891,8.519,4473
7,155,The Dark Knight,Batman raises the stakes in his war on crime. ...,2008-07-16,30.945,8.519,33638
8,497,The Green Mile,A supernatural tale set on death row in a Sout...,1999-12-10,17.3965,8.504,17965
9,496243,Parasite,"All unemployed, Ki-taek's family takes peculia...",2019-05-30,23.9818,8.501,18875


In [None]:
### using loop to fetch entire data

# Empty list to store data before converting to DataFrame
data_list = []

for i in range(1,505):
    response= requests.get('https://api.themoviedb.org/3/movie/top_rated?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US&page={}'.format(i))
    if response.status_code == 200:  # Check if request was successful
        json_data = response.json()
        tempp_df = pd.DataFrame(json_data['results'])  # Extract movie data
        
        # Select only required columns
        tempp_df = tempp_df[['id', 'title', 'overview', 'release_date', 'popularity', 'vote_average', 'vote_count']]
        
        # Store data in list
        data_list.append(temp_df)
    else:
        print(f"Failed to fetch page {i}, Status Code: {response.status_code}")

# Combine all dataframes using concat()
ndf = pd.concat(data_list, ignore_index=True)

Failed to fetch page 501, Status Code: 400
Failed to fetch page 502, Status Code: 400
Failed to fetch page 503, Status Code: 400
Failed to fetch page 504, Status Code: 400


In [None]:
ndf

Unnamed: 0,id,title,overview,release_date,popularity,vote_average,vote_count
0,278,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,1994-09-23,41.2793,8.700,28027
1,238,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",1972-03-14,43.2585,8.687,21244
2,240,The Godfather Part II,In the continuing saga of the Corleone crime f...,1974-12-20,17.6014,8.569,12832
3,424,Schindler's List,The true story of how businessman Oskar Schind...,1993-12-15,23.3275,8.565,16300
4,389,12 Angry Men,The defense and the prosecution have rested an...,1957-04-10,12.3068,8.549,9025
...,...,...,...,...,...,...,...
9995,769,GoodFellas,"The true story of Henry Hill, a half-Irish, ha...",1990-09-12,15.4170,8.500,13237
9996,346,Seven Samurai,A samurai answers a village's request for prot...,1954-04-26,6.2470,8.500,3846
9997,157336,Interstellar,The adventures of a group of explorers who mak...,2014-11-05,41.6271,8.500,36840
9998,12477,Grave of the Fireflies,"In the final months of World War II, 14-year-o...",1988-04-16,0.0149,8.450,5813


In [None]:
ndf.to_csv('bigDmovies.csv')

In [None]:
#### Practice your own pace - use rapidapi to get different datasets

# ---Import data from website using Web Scraping---

In [1]:
############################### {{Have to learn and understand}} #####################################

In [None]:
import pandas as pd
import requests # to extract data from an url
from bs4 import BeautifulSoup # this library is used for web scraping

In [3]:
requests.get(r'https://www.ambitionbox.com/list-of-companies?page=1') # access denied

<Response [403]>

In [4]:
requests.get(r'https://www.ambitionbox.com/list-of-companies?page=1').text  # access denied

'<HTML><HEAD>\n<TITLE>Access Denied</TITLE>\n</HEAD><BODY>\n<H1>Access Denied</H1>\n \nYou don\'t have permission to access "http&#58;&#47;&#47;www&#46;ambitionbox&#46;com&#47;list&#45;of&#45;companies&#63;" on this server.<P>\nReference&#32;&#35;18&#46;d44c3917&#46;1743610738&#46;45b846f6\n<P>https&#58;&#47;&#47;errors&#46;edgesuite&#46;net&#47;18&#46;d44c3917&#46;1743610738&#46;45b846f6</P>\n</BODY>\n</HTML>\n'

#### If response code is 403 and to get rid from this access denied problem we will use this method
 - headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36'}
 - requests.get('url',headers=headers).text

In [None]:
### for checking first few pages to ensure that we can use the below code structure to fetch data 
headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36'}
for page in range(1, 6):  # Test with 5 pages first
    try:
        response = requests.get(f'https://www.ambitionbox.com/list-of-companies?page={page}', headers=headers, timeout=10)
        print(f"Page {page} fetched. Status Code: {response.status_code}")
        if response.status_code != 200:
            print("Blocked or error occurred. Stopping...")
            break
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")

Request failed: HTTPSConnectionPool(host='www.ambitionbox.com', port=443): Read timed out. (read timeout=10)
Request failed: HTTPSConnectionPool(host='www.ambitionbox.com', port=443): Read timed out. (read timeout=10)
Request failed: HTTPSConnectionPool(host='www.ambitionbox.com', port=443): Read timed out. (read timeout=10)
Request failed: HTTPSConnectionPool(host='www.ambitionbox.com', port=443): Read timed out. (read timeout=10)
Request failed: HTTPSConnectionPool(host='www.ambitionbox.com', port=443): Read timed out. (read timeout=10)


In [None]:
### this is a general structure

# headers={'User-Agent':'Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36'}
#webpage = requests.get(r'https://www.ambitionbox.com/list-of-companies?page=1',headers=headers).text

### it will fetch full html-css code of the whole website

# ------

### Using a demo website (scraping sandbox)

In [3]:
import pandas as pd
import requests # to extract data from an url
from bs4 import BeautifulSoup # this library is used for web scraping

In [4]:
webpage = requests.get(r'https://sandbox.oxylabs.io/products?_gl=1*1lvchq7*_gcl_au*MTk0ODk2NTAzNS4xNzQzNjM5OTE0&page=1').text

In [None]:
with open('demo-file.html','w') as file: # save the html file for future
    file.write(webpage)

In [6]:
soup = BeautifulSoup(webpage, 'lxml' ) # 'lxml' is a html parcer/tools to retrive data

In [None]:
print(soup.prettify()) # to format correctly/prettyly

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width" name="viewport"/>
  <title>
   E-commerce	| Oxylabs Scraping Sandbox
  </title>
  <meta name="description"/>
  <meta content="E-commerce	| Oxylabs Scraping Sandbox" property="og:title"/>
  <meta property="og:description"/>
  <meta property="og:developer"/>
  <meta property="og:platform"/>
  <meta property="og:type"/>
  <meta property="og:currency"/>
  <meta property="og:price"/>
  <meta content="https://sandbox.oxylabs.ioundefined" property="og:image"/>
  <meta property="og:availability"/>
  <meta property="og:genre"/>
  <meta content="website" property="og:type"/>
  <link href="/favicon.ico" rel="icon"/>
  <meta content="16" name="next-head-count"/>
  <link href="/favicon.ico" rel="shortcut icon"/>
  <link as="font" crossorigin="" href="/fonts/Avalon-Book.woff" rel="preload" type="font/woff"/>
  <link as="font" crossorigin="" href="/fonts/Avalon-Demi.woff" rel="preload" type="font/w

In [None]:
soup.find_all('h4') # to print all the 'h4' tags on the website

[<h4 class="title css-7u5e79 eag3qlw7">The Legend of Zelda: Ocarina of Time</h4>,
 <h4 class="title css-7u5e79 eag3qlw7">Super Mario Galaxy</h4>,
 <h4 class="title css-7u5e79 eag3qlw7">Super Mario Galaxy 2</h4>,
 <h4 class="title css-7u5e79 eag3qlw7">Metroid Prime</h4>,
 <h4 class="title css-7u5e79 eag3qlw7">Super Mario Odyssey</h4>,
 <h4 class="title css-7u5e79 eag3qlw7">Halo: Combat Evolved</h4>,
 <h4 class="title css-7u5e79 eag3qlw7">The House in Fata Morgana - Dreams of the Revenants Edition -</h4>,
 <h4 class="title css-7u5e79 eag3qlw7">NFL 2K1</h4>,
 <h4 class="title css-7u5e79 eag3qlw7">Uncharted 2: Among Thieves</h4>,
 <h4 class="title css-7u5e79 eag3qlw7">Tekken 3</h4>,
 <h4 class="title css-7u5e79 eag3qlw7">The Legend of Zelda: The Wind Waker</h4>,
 <h4 class="title css-7u5e79 eag3qlw7">Gran Turismo</h4>,
 <h4 class="title css-7u5e79 eag3qlw7">Metal Gear Solid 2: Sons of Liberty</h4>,
 <h4 class="title css-7u5e79 eag3qlw7">Grand Theft Auto Double Pack</h4>,
 <h4 class="title 

In [None]:
soup.find_all('h4')[0:5] # to fetch first five item

[<h4 class="title css-7u5e79 eag3qlw7">The Legend of Zelda: Ocarina of Time</h4>,
 <h4 class="title css-7u5e79 eag3qlw7">Super Mario Galaxy</h4>,
 <h4 class="title css-7u5e79 eag3qlw7">Super Mario Galaxy 2</h4>,
 <h4 class="title css-7u5e79 eag3qlw7">Metroid Prime</h4>,
 <h4 class="title css-7u5e79 eag3qlw7">Super Mario Odyssey</h4>]

In [None]:
soup.find_all('h4')[0].text # to fetch the string only (game title), but it works with only one element

'The Legend of Zelda: Ocarina of Time'

In [None]:
## # to fetch the string only (all game title), this process works with multiple elements
elements = soup.find_all('h4')

for i in elements:
    print(i.text.strip())

The Legend of Zelda: Ocarina of Time
Super Mario Galaxy
Super Mario Galaxy 2
Metroid Prime
Super Mario Odyssey
Halo: Combat Evolved
The House in Fata Morgana - Dreams of the Revenants Edition -
NFL 2K1
Uncharted 2: Among Thieves
Tekken 3
The Legend of Zelda: The Wind Waker
Gran Turismo
Metal Gear Solid 2: Sons of Liberty
Grand Theft Auto Double Pack
Baldur's Gate II: Shadows of Amn
Tetris Effect: Connected
The Legend of Zelda Collector's Edition
Gran Turismo 3: A-Spec
The Legend of Zelda: A Link to the Past
The Legend of Zelda: Majora's Mask
The Last of Us
Persona 5 Royal
The Last of Us Remastered
The Legend of Zelda: Ocarina of Time 3D
Chrono Cross
Gears of War
Sid Meier's Civilization II
Halo 3
Ninja Gaiden Black
Super Mario Advance 4: Super Mario Bros. 3
Jet Grind Radio
Grim Fandango


In [None]:
len(soup.find_all('h4')) # there are 32 items in first page

32

In [None]:
pelements = soup.find_all('p')

for i in pelements:                            # there are problems, because there are many things that use p tag
    print(i.text.strip()) 

Game platforms:
3000 results - showing 1 to 32
Action Adventure Fantasy
As a young boy, Link is tricked by Ganondorf, the King of the Gerudo Thieves. The evil human uses Link to gain access to the Sacred Realm, where he places his tainted hands on Triforce and transforms the beautiful Hyrulean landscape into a barren wasteland. Link is determined to fix the problems he helped to create, so with the help of Rauru he travels through time gathering the powers of the Seven Sages.
Action Platformer 3D
[Metacritic's 2007 Wii Game of the Year] The ultimate Nintendo hero is taking the ultimate step ... out into space. Join Mario as he ushers in a new era of video games, defying gravity across all the planets in the galaxy. When some creature escapes into space with Princess Peach, Mario gives chase, exploring bizarre planets all across the galaxy. Mario, Peach and enemies new and old are here. Players run, jump and battle enemies as they explore all the planets in the galaxy. Since this game m

In [25]:
## to get rid from this problem, we will use Class attribute to identify p uniquely

pelements = soup.find_all('p', class_= 'category') # to specify the tags/category

for i in pelements:                            
    print(i.text.strip()) 

Action Adventure Fantasy
Action Platformer 3D
Action Platformer 3D
Action Shooter First-Person Sci-Fi
Action Platformer 3D
Action Shooter First-Person Sci-Fi
Adventure Visual Novel
Sports Traditional Football Sim
Action Adventure Modern Linear
Action Fighting 3D
Action Adventure Fantasy
Driving Racing GT / Street
Action Adventure Modern
Miscellaneous Compilation
Role-Playing PC-style RPG Western-Style
Puzzle Stacking
Action Adventure Fantasy
Driving Racing GT / Street
Action Adventure Fantasy
Action Adventure Fantasy
Modern Adventure General Action Adventure
Role-Playing Japanese-Style
Action Adventure General Modern
Miscellaneous Fantasy Compilation Action Adventure Open-World
Role-Playing Console-style RPG
Action Shooter Third-Person Sci-Fi Arcade
Strategy Turn-Based Historic General 4X
Action Shooter First-Person Sci-Fi Arcade
Action Adventure Fantasy
Action Platformer 2D
Action Platformer 3D
Adventure General 3D Third-Person Fantasy


In [None]:
len(soup.find_all('p', class_= 'category')) # there are 32 items in first page

32

#### From here, we will approach different way  ----------vvv

In [30]:
game = soup.find_all('div', class_= 'product-card')

In [31]:
len(game)

32

In [None]:
## now we run loops on each games, -------this is the correct approach

for i in game:
    h4_elements = i.find_all('h4')  # Returns a list of <h4> elements
    for h4 in h4_elements:  
        print(h4.text.strip())  # Extract text and remove extra spaces

The Legend of Zelda: Ocarina of Time
Super Mario Galaxy
Super Mario Galaxy 2
Metroid Prime
Super Mario Odyssey
Halo: Combat Evolved
The House in Fata Morgana - Dreams of the Revenants Edition -
NFL 2K1
Uncharted 2: Among Thieves
Tekken 3
The Legend of Zelda: The Wind Waker
Gran Turismo
Metal Gear Solid 2: Sons of Liberty
Grand Theft Auto Double Pack
Baldur's Gate II: Shadows of Amn
Tetris Effect: Connected
The Legend of Zelda Collector's Edition
Gran Turismo 3: A-Spec
The Legend of Zelda: A Link to the Past
The Legend of Zelda: Majora's Mask
The Last of Us
Persona 5 Royal
The Last of Us Remastered
The Legend of Zelda: Ocarina of Time 3D
Chrono Cross
Gears of War
Sid Meier's Civilization II
Halo 3
Ninja Gaiden Black
Super Mario Advance 4: Super Mario Bros. 3
Jet Grind Radio
Grim Fandango


In [None]:
## alternative of above code
for i in game:
    print(i.find('h4').text.strip()) # make single element as text at a time, so it runs correctly
    

The Legend of Zelda: Ocarina of Time
Super Mario Galaxy
Super Mario Galaxy 2
Metroid Prime
Super Mario Odyssey
Halo: Combat Evolved
The House in Fata Morgana - Dreams of the Revenants Edition -
NFL 2K1
Uncharted 2: Among Thieves
Tekken 3
The Legend of Zelda: The Wind Waker
Gran Turismo
Metal Gear Solid 2: Sons of Liberty
Grand Theft Auto Double Pack
Baldur's Gate II: Shadows of Amn
Tetris Effect: Connected
The Legend of Zelda Collector's Edition
Gran Turismo 3: A-Spec
The Legend of Zelda: A Link to the Past
The Legend of Zelda: Majora's Mask
The Last of Us
Persona 5 Royal
The Last of Us Remastered
The Legend of Zelda: Ocarina of Time 3D
Chrono Cross
Gears of War
Sid Meier's Civilization II
Halo 3
Ninja Gaiden Black
Super Mario Advance 4: Super Mario Bros. 3
Jet Grind Radio
Grim Fandango


In [67]:
name = []
category = []
cost = []
available = []

for i in game:
    name.append(i.find('h4').text.strip()) # append in the empty list 'name'

    category.append(i.find('p', class_= 'category').text.strip())

    cost.append(i.find('div', class_= 'price-wrapper').text.strip())

In [68]:
# name
# category
cost


['91,99 €',
 '91,99 €',
 '91,99 €',
 '89,99 €',
 '89,99 €',
 '87,99 €',
 '83,99 €',
 '62,99 €',
 '88,99 €',
 '91,99 €',
 '90,99 €',
 '86,99 €',
 '88,99 €',
 '81,99 €',
 '91,99 €',
 '88,99 €',
 '89,99 €',
 '84,99 €',
 '90,99 €',
 '91,99 €',
 '92,99 €',
 '84,99 €',
 '92,99 €',
 '90,99 €',
 '88,99 €',
 '84,99 €',
 '88,99 €',
 '81,99 €',
 '88,99 €',
 '89,99 €',
 '83,99 €',
 '91,99 €']

In [None]:
d = {
    'Title': name,
    'Tag': category,
    'Price': cost
}

df =pd.DataFrame(d) # this is the dataframe for first page which has 32 rows/items
df

Unnamed: 0,Title,Tag,Price
0,The Legend of Zelda: Ocarina of Time,Action Adventure Fantasy,"91,99 €"
1,Super Mario Galaxy,Action Platformer 3D,"91,99 €"
2,Super Mario Galaxy 2,Action Platformer 3D,"91,99 €"
3,Metroid Prime,Action Shooter First-Person Sci-Fi,"89,99 €"
4,Super Mario Odyssey,Action Platformer 3D,"89,99 €"
5,Halo: Combat Evolved,Action Shooter First-Person Sci-Fi,"87,99 €"
6,The House in Fata Morgana - Dreams of the Reve...,Adventure Visual Novel,"83,99 €"
7,NFL 2K1,Sports Traditional Football Sim,"62,99 €"
8,Uncharted 2: Among Thieves,Action Adventure Modern Linear,"88,99 €"
9,Tekken 3,Action Fighting 3D,"91,99 €"


In [80]:
### to create dataframe for 94 page where total items/rows are 3000

final_df = pd.DataFrame()

for page in range(1,95):
    url = f'https://sandbox.oxylabs.io/products?_gl=1*1lvchq7*_gcl_au*MTk0ODk2NTAzNS4xNzQzNjM5OTE0&page={page}'

    webpage = requests.get(url).text
    soup = BeautifulSoup(webpage, 'lxml')
    game = soup.find_all('div', class_= 'product-card')
    name = []
    category = []
    cost = []
    available = []

    for i in game:
        name.append(i.find('h4').text.strip()) # append in the empty list 'name'

        category.append(i.find('p', class_= 'category').text.strip())

        cost.append(i.find('div', class_= 'price-wrapper').text.strip())
    d = {
    'Title': name,
    'Tag': category,
    'Price': cost
    }

    df =pd.DataFrame(d)
    final_df = pd.concat([final_df, df], ignore_index=True)

In [82]:
final_df # this is the final output

Unnamed: 0,Title,Tag,Price
0,The Legend of Zelda: Ocarina of Time,Action Adventure Fantasy,"91,99 €"
1,Super Mario Galaxy,Action Platformer 3D,"91,99 €"
2,Super Mario Galaxy 2,Action Platformer 3D,"91,99 €"
3,Metroid Prime,Action Shooter First-Person Sci-Fi,"89,99 €"
4,Super Mario Odyssey,Action Platformer 3D,"89,99 €"
...,...,...,...
2995,Crashday,Driving Racing Arcade Automobile,"76,99 €"
2996,The Con,Action Fighting 3D,"72,99 €"
2997,Van Helsing,Action Shooter Third-Person Fantasy,"76,99 €"
2998,Rogue Ops,Action Adventure Modern,"63,99 €"


In [83]:
final_df.head(50)

Unnamed: 0,Title,Tag,Price
0,The Legend of Zelda: Ocarina of Time,Action Adventure Fantasy,"91,99 €"
1,Super Mario Galaxy,Action Platformer 3D,"91,99 €"
2,Super Mario Galaxy 2,Action Platformer 3D,"91,99 €"
3,Metroid Prime,Action Shooter First-Person Sci-Fi,"89,99 €"
4,Super Mario Odyssey,Action Platformer 3D,"89,99 €"
5,Halo: Combat Evolved,Action Shooter First-Person Sci-Fi,"87,99 €"
6,The House in Fata Morgana - Dreams of the Reve...,Adventure Visual Novel,"83,99 €"
7,NFL 2K1,Sports Traditional Football Sim,"62,99 €"
8,Uncharted 2: Among Thieves,Action Adventure Modern Linear,"88,99 €"
9,Tekken 3,Action Fighting 3D,"91,99 €"


In [84]:
final_df.shape

(3000, 3)