In [1]:
import pandas as pd
import numpy as np

# Web Scraping using Beautifulsoup
Web Scraping means to collect data from the Internet. As a beginner in data science, you must have seen CSV files on the Internet distributed by some popular websites like Kaggle and other govt websites. The data is prepared by either collecting and writing using standard methods or by scraping it from the Internet.
Web Scraping to Create a CSV File
So we need two primary packages for this task, BeautifulSoup and urllib. We can easily install both these packages using the pip command - pip install bs4 and pip install urllib.

In [2]:
# I will scrape data from Flipkart and create a CSV file from that data. It's not that difficult what it seems. Let's get
# our hands dirty with web scraping to create a CSV file using python. I will start by importing the necessary packages 
# that we need for this task. So let's get started.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
my_url="https://www.flipkart.com/search?q=samsung+mobiles&sid=tyy%2C4io&as=on&as-show=on&otracker=AS_QueryStore_HistoryAutoSuggest_0_2&otracker1=AS_QueryStore_HistoryAutoSuggest_0_2&as-pos=0&as-type=HISTORY&as-searchtext=sa"
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
# Now let's see how many HTML containers are present in this link:
containers = page_soup.findAll("div", { "class": "_3E8aIl X3IECw row"})
print(len(containers))
containers

1


[<div class="_3E8aIl X3IECw row"><div class="_88-NUv">Reviews for Popular Mobiles</div><div class="_2nRPpA"><div class="_1Ni40J"><div class="_3hVUcF"><div class="CXW8mj" style="height:150px;width:150px"><img alt="SAMSUNG Galaxy M12 (Blue, 64 GB)" class="_396cs4" src="//static-assets-web.flixcart.com/www/linchpin/fk-cp-zion/img/placeholder_fcebae.svg"/></div></div><div class="_1kLt05"><a href="/samsung-galaxy-m12-blue-64-gb/p/itm062b6548d8904?pid=MOBGFG8GCPEGKGF4&amp;marketplace=FLIPKART" rel="noopener noreferrer" target="_blank"><div class="_1W9f5C"><div>1. SAMSUNG Galaxy M12 (Blue, 6...</div></div><div class="_3VDxyD"><div class="_3LWZlK">4.2</div><span class="_34hpFu"><span>

In [3]:
page_soup

<!DOCTYPE html>
<html lang="en"><head><link href="https://rukminim1.flixcart.com" rel="preconnect"/><link href="//static-assets-web.flixcart.com/www/linchpin/fk-cp-zion/css/app_modules.chunk.94b5e7.css" rel="stylesheet"/><link href="//static-assets-web.flixcart.com/www/linchpin/fk-cp-zion/css/app.chunk.6e7580.css" rel="stylesheet"/><meta content="text/html; charset=utf-8" http-equiv="Content-type"/><meta content="IE=Edge" http-equiv="X-UA-Compatible"/><meta content="102988293558" property="fb:page_id"/><meta content="658873552,624500995,100000233612389" property="fb:admins"/><meta content="noodp" name="robots"/><link href="https://static-assets-web.flixcart.com/www/promos/new/20150528-140547-favicon-retina.ico" rel="shortcut icon"/><link href="/osdd.xml?v=2" rel="search" type="application/opensearchdescription+xml"/><meta content="website" property="og:type"/><meta content="Flipkart.com" name="og_site_name" property="og:site_name"/><link href="/apple-touch-icon-57x57.png" rel="apple-to

In [4]:
containers = page_soup.findAll("div", { "class": "_2nRPpA"})
print(len(containers))
# print(soup.prettify(containers[0]))
containers

5


[<div class="_2nRPpA"><div class="_1Ni40J"><div class="_3hVUcF"><div class="CXW8mj" style="height:150px;width:150px"><img alt="SAMSUNG Galaxy M12 (Blue, 64 GB)" class="_396cs4" src="//static-assets-web.flixcart.com/www/linchpin/fk-cp-zion/img/placeholder_fcebae.svg"/></div></div><div class="_1kLt05"><a href="/samsung-galaxy-m12-blue-64-gb/p/itm062b6548d8904?pid=MOBGFG8GCPEGKGF4&amp;marketplace=FLIPKART" rel="noopener noreferrer" target="_blank"><div class="_1W9f5C"><div>1. SAMSUNG Galaxy M12 (Blue, 6...</div></div><div class="_3VDxyD"><div class="_3LWZlK">4.2</div><span class="_34hpFu"><span>14,565 Ratings</span><span><span class="_2oY1qB">&amp;</span><span>1,080 Reviews</span

In [5]:
# So we have Samsung Galaxy M01 smartphone with blue colour as the first item on the Flipkart webpage that 
# we have scrapped. Now let's have a look at the price of this smartphone:
container = containers[0]
image = container.findAll("div", {"class": "CXW8mj"})
image[0].img['alt']

'SAMSUNG Galaxy M12 (Blue, 64 GB)'

In [6]:
# Now let's have a look at the price of this smartphone:
price = container.findAll("div", {"class": "_30jeq3 UMT9wN"})
print(price[0].text)


₹10,285


In [7]:
# Now let's have a look at its ratings from its customers:
ratings = container.findAll("span", {"class": "_34hpFu"})
print(ratings[0].text)

14,565 Ratings&1,080 Reviews


In [8]:
description = container.findAll("span", {"class": "_34hpFu"})
print(description[0].text)

14,565 Ratings&1,080 Reviews


In [9]:
#  mmm=  pd.read_json

In [10]:
# Now let's have a look at what our CSV file has stored after the web scraping of Flipkart:
products = []
pricing = []
rate = []
description = []
    
for container in containers:
    image = container.findAll("div", {"class": "CXW8mj"})
    product_name = image[0].img['alt']
    products.append(product_name)
    price_container = container.findAll("div", {"class": "_30jeq3 UMT9wN"})
    price = price_container[0].text.strip()
    pricing.append(price)
    rating_container = container.findAll("span", {"class": "_34hpFu"})
    rating = rating_container[0].text
    rate.append(rating)
    desc = container.findAll("ul", {"class": "_1Sq2Fs"})
    product_desc = desc[0].text
    description.append(product_desc)
    
    print("Product_Name:"+ product_name)
    print("Price: " + price)
    print("Ratings:" + rating)
    print("description:" + product_desc)

Product_Name:SAMSUNG Galaxy M12 (Blue, 64 GB)
Price: ₹10,285
Ratings:14,565 Ratings&1,080 Reviews
description:4 GB RAM | 64 GB ROM16.51 cm (6.5 inch) Display48MP Rear Camera
Product_Name:SAMSUNG Guru Music 2
Price: ₹2,250
Ratings:1,88,668 Ratings&17,803 Reviews
description:NA ROM5.08 cm (2 inch) NA Display800 mAh Battery
Product_Name:SAMSUNG Guru Music 2 SM-B315E
Price: ₹1,990
Ratings:8,059 Ratings&591 Reviews
description:Expandable Upto 16 GB5.08 cm (2 inch) QVGA Display0MP Front Camera
Product_Name:SAMSUNG Galaxy F23 5G (Aqua Blue, 128 GB)
Price: ₹16,999
Ratings:27,642 Ratings&2,828 Reviews
description:6 GB RAM | 128 GB ROM | Expandable Upto 1 TB16.76 cm (6.6 inch) Full HD+ Display50MP + 8MP + 2MP | 8MP Front Camera
Product_Name:SAMSUNG Galaxy F22 (Denim Black, 128 GB)
Price: ₹13,999
Ratings:44,063 Ratings&4,124 Reviews
description:6 GB RAM | 128 GB ROM | Expandable Upto 1 TB16.26 cm (6.4 inch) HD+ Display48MP + 8MP + 2MP + 2MP | 13MP Front Camera


In [11]:
# Now let's create a CSV file and store all the mobile phones with their name, price and ratings:
# filename = "products.csv"
# f = open(filename, "w")
# headers = "Product_Name, Pricing, Ratings \n"
# f.write(headers)
df = pd.DataFrame({"Product_Name":products, "Pricing":price, "Rating":rate, "Description":description})
df.to_csv("Samsung_mobile.csv")
samsung = pd.read_csv("Samsung_mobile.csv")
samsung.head()
# df.head(10)
# df1 = df[10].str.split("", expand=True)
# df1.head(10)
# df1[10] = df1[10].str.strip("[")
# col_labels = soup.find_all("th")
# all_header = []
# col_str = str(col_labels)
# cleantext2 = BeautifulSoup(col_str, "lxml").get_text()
# all_header.append(cleantext2)
# df2 = pd.DataFrame(all_header)
# df3 = df2[0].str.split("", expand=True)
# frames = [df3, df1]
# df4 = pd.concat(frames)
# df5 = df4.rename(columns=df4.iloc[0])
# df5.head()

Unnamed: 0.1,Unnamed: 0,Product_Name,Pricing,Rating,Description
0,0,"SAMSUNG Galaxy M12 (Blue, 64 GB)","₹13,999","14,565 Ratings&1,080 Reviews",4 GB RAM | 64 GB ROM16.51 cm (6.5 inch) Displa...
1,1,SAMSUNG Guru Music 2,"₹13,999","1,88,668 Ratings&17,803 Reviews",NA ROM5.08 cm (2 inch) NA Display800 mAh Battery
2,2,SAMSUNG Guru Music 2 SM-B315E,"₹13,999","8,059 Ratings&591 Reviews",Expandable Upto 16 GB5.08 cm (2 inch) QVGA Dis...
3,3,"SAMSUNG Galaxy F23 5G (Aqua Blue, 128 GB)","₹13,999","27,642 Ratings&2,828 Reviews",6 GB RAM | 128 GB ROM | Expandable Upto 1 TB16...
4,4,"SAMSUNG Galaxy F22 (Denim Black, 128 GB)","₹13,999","44,063 Ratings&4,124 Reviews",6 GB RAM | 128 GB ROM | Expandable Upto 1 TB16...


In [12]:
# EXample 2: Let's also scrap another section of the website.
my_url = "https://www.flipkart.com/ckf/czl/~cs-argz57cui9/pr?sid=ckf,czl&collection-tab-name=Big+Screen+PMU&wid=11.productCard.PMU_V2_10"
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
# Now let's see how many HTML containers are present in this link:
containers = page_soup.findAll("div", { "class": "_1YokD2 _3Mn1Gg"})
print(len(containers))

1


In [13]:
my_url = "https://www.flipkart.com/clothing-and-accessories/topwear/pr?sid=clo%2Cash&otracker=categorytree&p%5B%5D=facets.ideal_for%255B%255D%3DMen&otracker=nmenu_sub_Men_0_Top%20wear"
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
# Now let's see how many HTML containers are present in this link:
containers = page_soup.findAll("div", { "class": "_1YokD2 _3Mn1Gg"})
print(len(containers))
containers

1


[<div class="_1YokD2 _3Mn1Gg" style="flex-grow:1;overflow:auto"><div class="_1YokD2 _2GoDe3 col-12-12" style="background-color:#ffffff;align-items:flex-end"><div class="_1AtVbE" style="flex-grow:1;overflow:auto"><div class="W_R1IA"><div class="_1MR4o5"><div class="_3GIHBu"><a class="_2whKao" href="/">Home</a><svg class="_39X-Og" height="27" viewbox="0 0 16 27" width="16" xmlns="http://www.w3.org/2000/svg"><path class="DpXnhQ" d="M16 23.207L6.11 13.161 16 3.093 12.955 0 0 13.161l12.955 13.161z" fill="#fff"></path></svg></div><div class="_3GIHBu"><a class="_2whKao" href="/clothing-and-accessories/pr?sid=clo&amp;marketplace=FLIPKART">Clothing and Accessories</a><svg class="_39X-Og" height="27" viewbox="0 0 16 27" width="16" xmlns="http://www.w3.org/2000/svg"><path class="DpXnhQ" d="M16 23.207L6.11 13.161 16 3.093 12.955 0 0 13.161l12.955 13.161z" fill="#fff"></path></svg></div><div class="_3GIHBu"><a class="_2whKao" href="/clothing-and-accessories/topwear/pr?sid=clo,ash&amp;marketplace=FL

In [14]:
# print div for the first container
# print(soup.prettify(containers[0]))

In [15]:
# Let us crap information for one product first
product_div = page_soup.findAll("a", { "class": "_1fQZEK"})
product_detail = product_div[0]
print(soup.prettify(product_detail))

IndexError: list index out of range

In [None]:
# Let get the name of the product
product_detail.findAll("div", {"class": "_4rR01T"})[0].text

In [None]:
# Let get the description of the product
product_detail.findAll("ul", {"class": "_1xgFaf"})[0].text

In [None]:
# Let get the price of the product
product_detail.findAll("div", {"class": "_30jeq3 _1_WHN1"})[0].text

In [None]:
# Let get the rating of the product
rate =product_detail.findAll("span", {"class": "_2_R_DZ"})[0].text.split()
rate[0]+" "+rate[1]

In [None]:
# Now that we are able to extract details for one product, let's now loop for other products
products = []
pricing = []
rating = []
description = []
containers = page_soup.findAll("div", { "class": "_1YokD2 _3Mn1Gg"})
for container in containers[1]:
    for val in container.findAll("div", {"class": "_4rR01T"}):
        if len(val)!=0:
           #     append product name for each
            products.append(val.text)
            print(val.text)
            #     append product description for each
            description.append(container.findAll("ul", {"class": "_1xgFaf"})[0].text)
            print(container.findAll("ul", {"class": "_1xgFaf"})[0].text)
            #     append product price for each
            pricing.append(container.findAll("div", {"class": "_30jeq3 _1_WHN1"})[0].text)
            print(container.findAll("div", {"class": "_30jeq3 _1_WHN1"})[0].text)
            #     append product rating for each
            rate =container.findAll("span", {"class": "_2_R_DZ"})[0].text.split()
            rating.append(rate[0]+" "+rate[1])
            print(rate[0]+" "+rate[1])

In [None]:
df = pd.DataFrame({"Product_Name":products, "Pricing":pricing, "Rating":rating, "Description":description})
df.to_csv("tv_data.csv")
tv_data = pd.read_csv("tv_data.csv")
tv_data.head()

# Wikipedia Scraping
Wikipedia is one of those largest platforms which provides almost every information for free. From your kindergarten till today, you must have visited this platform atleast once to get any information from school presentations to professional research, Wikipedia helps everybody.
Unlike other sources of information websites, Wikipedia has its API to scrape data from its articles. Python being a general-purpose programming language provides packages for almost every task. So we have a package named as wikipedia for Python which we can use to scrape Wikipedia articles using Python. To scrape useful information from Wikipedia, you need to install a package named as wikipedia, which can be easily 
installed using the pip command- pip install wikipedia.

In [16]:
# import wilipedia as wiki
import wikipedia as wiki
print(dir(wiki))
# wiki.__version__

['API_URL', 'BeautifulSoup', 'Decimal', 'DisambiguationError', 'HTTPTimeoutError', 'ODD_ERROR_MESSAGE', 'PageError', 'RATE_LIMIT', 'RATE_LIMIT_LAST_CALL', 'RATE_LIMIT_MIN_WAIT', 'RedirectError', 'USER_AGENT', 'WikipediaException', 'WikipediaPage', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', '__version__', 'cache', 'datetime', 'debug', 'donate', 'exceptions', 'geosearch', 'languages', 'page', 'random', 're', 'requests', 'search', 'set_lang', 'set_rate_limiting', 'set_user_agent', 'stdout_encode', 'suggest', 'summary', 'sys', 'time', 'timedelta', 'unicode_literals', 'util', 'wikipedia']


In [17]:
# To explain the use of this package, I will scrape information based on Python. So let's start with the task to scrape
# Wikipedia articles. The code below will get all the search suggestions of our input. In our case, it will return the 
# search suggestions of Python:
print(wiki.search("python"))

['Python', 'Python (programming language)', 'Monty Python', 'PYTHON', 'Ball python', 'Burmese python', 'Reticulated python', 'Colt Python', 'Python (missile)', 'History of Python']


In [25]:
# Now let's see will the search engine on Wikipedia suggest us python if we will type only some alphabets of its spelling:
print(wiki.suggest("Pyth"))
# print(help(wiki.suggest))

pith


In [28]:
# Yes, it works, now let's have a look how we can get the summary of an article on Wikipedia:
print(wiki.summary("Python (genus)"))

Python is a genus of constricting snakes in the Pythonidae family native to the tropics and subtropics of the Eastern Hemisphere.The name Python was proposed by François Marie Daudin in 1803 for non-venomous flecked snakes.
Currently, 10 python species are recognized as valid taxa.Three formerly considered python subspecies have been promoted, and a new species recognized.


In [31]:
# If you want to read the summary in another language other than English, we can also do that. I will get the same summary
# above in the French language:
wiki.set_lang("fr")
print(wiki.summary("Python"))

Python (prononcé /pi.tɔ̃/) est un langage de programmation interprété, multi-paradigme et multiplateformes. Il favorise la programmation impérative structurée, fonctionnelle et orientée objet. Il est doté d'un typage dynamique fort, d'une gestion automatique de la mémoire par ramasse-miettes et d'un système de gestion d'exceptions ; il est ainsi similaire à Perl, Ruby, Scheme, Smalltalk et Tcl.
Le langage Python est placé sous une licence libre proche de la licence BSD et fonctionne sur la plupart des plates-formes informatiques, des smartphones aux ordinateurs centraux, de Windows à Unix avec notamment GNU/Linux en passant par macOS, ou encore Android, iOS, et peut aussi être traduit en Java ou .NET. Il est conçu pour optimiser la productivité des programmeurs en offrant des outils de haut niveau et une syntaxe simple à utiliser.
Il est également apprécié par certains pédagogues qui y trouvent un langage où la syntaxe, clairement séparée des mécanismes de bas niveau, permet une initia

In [33]:
# Now let's change the language back to English and have a look at some more insights from the article. Here I will 
# scrape all the information we will get if we will read about python on Wikipedia:
wiki.set_lang("en")
p = wiki.page("Python (programming language)")
# To get the Title:
print(p.title)

Python (programming language)


In [34]:
# To get the url of the article:
print(p.url)

https://en.wikipedia.org/wiki/Python_(programming_language)


In [35]:
# To scrape the full article:
print(p.content)

Python is a high-level, interpreted, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation.Python is dynamically-typed and garbage-collected. It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming. It is often described as a "batteries included" language due to its comprehensive standard library.Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0. Python 2.0 was released in 2000 and introduced new features such as list comprehensions, cycle-detecting garbage collection, reference counting, and Unicode support. Python 3.0, released in 2008, was a major revision that is not completely backward-compatible with earlier versions. Python 2 was discontinued with version 2.7.18 in 2020.Python consistently ranks as one of the most popular progra

In [38]:
# To get all the images in the article:
print(p.images)

['https://upload.wikimedia.org/wikipedia/commons/3/31/Free_and_open-source_software_logo_%282009%29.svg', 'https://upload.wikimedia.org/wikipedia/commons/9/94/Guido_van_Rossum_OSCON_2006_cropped.png', 'https://upload.wikimedia.org/wikipedia/commons/6/6f/Octicons-terminal.svg', 'https://upload.wikimedia.org/wikipedia/commons/c/c3/Python-logo-notext.svg', 'https://upload.wikimedia.org/wikipedia/commons/1/10/Python_3._The_standard_type_hierarchy.png', 'https://upload.wikimedia.org/wikipedia/commons/b/bd/Python_Powered.png', 'https://upload.wikimedia.org/wikipedia/commons/d/df/Wikibooks-logo-en-noslogan.svg', 'https://upload.wikimedia.org/wikipedia/commons/f/fa/Wikibooks-logo.svg', 'https://upload.wikimedia.org/wikipedia/commons/f/ff/Wikidata-logo.svg', 'https://upload.wikimedia.org/wikipedia/commons/f/fa/Wikiquote-logo.svg', 'https://upload.wikimedia.org/wikipedia/commons/0/0b/Wikiversity_logo_2017.svg', 'https://upload.wikimedia.org/wikipedia/en/4/4a/Commons-logo.svg', 'https://upload.wi

In [39]:
# And to get all the referals used by Wikipedia in the article:
print(p.links)

['"Hello, World!" program', '3ds Max', '?:', 'ABC (programming language)', 'ADMB', 'ALGOL', 'ALGOL 68', 'APL (programming language)', 'Abaqus', 'Academic Free License', 'Academic conference', 'Action selection', 'Activation function', 'Ada (programming language)', 'Advanced Simulation Library', 'Adversarial machine learning', 'AlexNet', 'Alex Graves (computer scientist)', 'Alex Martelli', 'Algebra', 'AlphaFold', 'AlphaGo', 'AlphaZero', 'Alternative terms for free software', 'Amazon (company)', 'AmigaOS 4', 'Amoeba (operating system)', 'Anaconda (installer)', 'Analyse-it', 'Andrew Ng', 'Android (operating system)', 'Anonymous function', 'Apache Groovy', 'Apache License', 'Apache webserver', 'Aphorism', 'Apple M1', 'Apple Public Source License', 'ArXiv (identifier)', 'Arbitrary-precision arithmetic', 'ArcGIS', 'Arithmetic operations', 'Array index', 'Array slicing', 'Artificial intelligence', 'Artificial neural network', 'Artistic License', 'Aspect-oriented programming', 'Assembly langua

# Twitter Scraping
One of the hot topics in data science is social media analytics. People love these analyzes and interest them because everyone knows this world. Most of our time is spent on Twitter, Instagram, Facebook, and some other social media apps. The use of social media analysis is mostly used in the tasks of relationship analysis. With not only scraping twitter with python, but I will also do some relationship analysis based on our scrapped data.

Scraping Twitter with Python 
Now, let's start with our task. In this task of scraping twitter with python, we need to install a package known as twint, which can be easily installed by using the pip command in your terminal - pip install twint.

In [40]:
import twint
# print(dir(twint))
import pandas as pd
from collections import Counter

In [41]:
# After importing the necessary libraries, now we need to start by creating a user list consisting of Twitter accounts. 
# We will analyze the relationships between the Twitter accounts of these people that I will add in the list below:

users = [
    'Elonmusk'
    'Arsenal',
    'Leereports',
    'segalink',
]

In [42]:
# Scraping Twitter with Python and Analyzing Relationships
# Now let's start by scraping Twitter with python and to analyze the relationships between all the Twitter accounts in our
# list above, I'll write a function named get_followings which will send a request to the twint library with a username. 
# This function will return a list of users that our input user follows:

def get_followings(username):

    c = twint.Config()
    c.Username = username
    c.Pandas = True

    twint.run.Following(c)
    list_of_followings = twint.storage.panda.Follow_df

    return list_of_followings['following'][username]

In [43]:
# The for loop below will create two variables, as sometimes we get index error when Twitter does not respond to our 
# request. For such cases, I added an exception to the code to ignore these users:

followings = {}
following_list = []
for person in users:
    print('#####\nStarting: ' + person + '\n#####')
    try:
        followings[person] = get_followings(person)
        following_list = following_list + followings[person]
    except KeyError:
        print('IndexError')

#####
Starting: ElonmuskArsenal
#####


RuntimeError: This event loop is already running

In [42]:
# After getting all of the following lists, we can just calculate the most common values in the following_list variable 
# to get the most popular accounts among our users. To get the 10 most followed accounts, we will use the Counter function
# of the collection library:

Counter(following_list).most_common(10)

[]

In [43]:
# What if we want to see who's following who in our user group? To study it, I wrote a for loop that checks if anyone among 
# the users is in the following list of another person. As a result, it creates a list dictionary displaying the following
# states represented by True and False:

follow_relations ={}
for following_user in followings.keys():
    follow_relation_list = []
    for followed_user in followings.keys():
        if followed_user in followings[following_user]:
            follow_relation_list.append(True)
        else:
            follow_relation_list.append(False)
    follow_relations[following_user] = follow_relation_list

In [44]:
# In the code below, the resulting dictionary is transformed into a pandas dataframe for a more user-friendly 
# visualization. The rows of the dataframe show the users who follow, while the columns show the users who are followed:

following_df = pd.DataFrame.from_dict(follow_relations, 
                                      orient='index', columns=followings.keys())
following_df

  if method in B.__dict__:


# Instagram Scraping
First of all, if you are learning Data Science then scraping Instagram will help you in getting the new trends of businesses, so that you can generate more leads and can reach out for your new potential customers.
To scrap Instagram, we will use a library know as instaloader which provides us with an API for scraping Instagram. You can install this library by using the pip method in your terminal - pip install instaloader. Now If you have installed this package then let's get started with the task.

In [1]:
# Import the module
import instaloader

  readline_hook.enable(use_pyreadline=use_pyreadline)


In [3]:
# Create an instance of Instaloader class
bot = instaloader.Instaloader()

# Load a profile from an Instagram handle
profile = instaloader.Profile.from_username(bot.context, 'i.am_argentum')
print(type(profile))
print(profile)

<class 'instaloader.structures.Profile'>
<Profile i.am_argentum (1382778823)>


In [4]:
# Now let's see how we can extract some valuable information from an Instagram profile:

print("Username: ", profile.username)
print("User ID: ", profile.userid)
print("Number of Posts: ", profile.mediacount)
print("Followers: ", profile.followers)
print("Followees: ", profile.followees)
print("Bio: ", profile.biography,profile.external_url)

Username:  i.am_argentum
User ID:  1382778823
Number of Posts:  39
Followers:  231
Followees:  225
Bio:  Stakeholder @argentum_digital.wallet; convert digital funds to Naira🌏🌍
BTech Mechanical Engineering
Mechanic by training
Leader and Nation builder None


In [None]:
# Login with username and password in the script
username = input("Enter your username>>")
password = input("Enter your password>>")

bot.login(user=username,passwd=password)

In [None]:
# Interactive login on terminal
profile = bot.interactive_login("oluwapaul") # Asks for password in the terminal

In [None]:
# Scraping your followers and followees will help you in getting a list of their usernames, which you will require to do 
# when you will work in a professional environment in the data science field:

# Retrieve the usernames of all followers
followers = [follower.username for follower in profile.get_followers()]
print(followers)
print(len(followers))

In [None]:
# Retrieve the usernames of all followees
followees = [followee.username for followee in profile.get_followees()]
print(followees)

In [None]:
# Getting posts from any profile is easy in python. We just need to use get_posts(). I will use this method on the profile of
# someone else. To download each post, we need to loop over the generator object using .download_post() method. Now let's 
# go through this:

# Load a new profile
profile = instaloader.Profile.from_username(bot.context, 'arzoogarg')

# Get all posts in a generator object
posts = profile.get_posts()
posts

In [None]:
# Iterate and download
# It will save the post folder by creating its directory. In each folder, you will see the actual content of the posts of
# the profile like a video or images.

for index, post in enumerate(posts, 1):
    bot.download_post(post, target=f"{profile.username}_{index}")

# Youtube Scraping
For the task of scraping youtube with python, we have a package known as pytube3, which can be easily installed by using the pip command - pip install pytube3. Now let's get started with the task of scraping youtube by importing this package:

In [1]:
# You will notice that we have and installed pytube3 by using the pip command, but we are importing pytube and not pytube3.
# To eliminate confusion, pytube3 is also imported by writing only pytube. We don't import it by writing as pytube3.
from pytube import YouTube

In [2]:
# Our next step is to get the link of the youtube video which we need to scrape using python. Don't forget to place your 
# link between " " or ' ':

link = input("Enter Link of Youtube Video: ")
yt = YouTube(link)

Enter Link of Youtube Video: https://www.youtube.com/watch?v=_P-JRZgKz7E


In [3]:
# Now, as we have taken the link of a youtube video as an input, we will run the program while scraping some information about the video. Now, I will write some necessary functions that we can use for scraping youtube with python by using pytube 
# module:
# To print title
yt = YouTube(link)
print("Title :", yt.title)
# To get number of views
print("Views :", yt.views)
# To get the length of video
print("Duration :", yt.length)
# To get description
print("Description :", yt.description)
# To get ratings
print("Ratings : ", yt.rating)

Title : Nigerian PRAISE DRUMS
Views : 2865
Duration : 101
Description : 
Ratings :  None


In [4]:
# The task of scraping youtube with python will be left incomplete if we do not know how to download the youtube video. 
# It's obvious that we are more considered about the data, but still, if you want to see how we can download the youtube 
# video using python then you can use the code below:


# pip install git+https://github.com/baxterisme/pytube

stream = yt.streams.get_highest_resolution()
stream.download()
print("Download completed!!")

Download completed!!


In [1]:
name = [1,2,3,4,5,6]
age = [1,2,3,4,9,6]
data = []
for x , y in zip(name,age):
    mm = x * y
    data.append(mm)

data

[1, 4, 9, 16, 45, 36]