In [1]:
# Connect to Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Import libraries
import requests
import pandas as pd
import csv
import re
import string
import nltk

from bs4 import BeautifulSoup
from numpy import NAN
from nltk.stem import WordNetLemmatizer

In [3]:
# Define a user agent to qualify request
user_agent = ({"User-Agent":
                    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) \
                    AppleWebKit/537.36 (KHTML, like Gecko) \
                    Chrome/90.0.4430.212 Safari/537.36",
               "Accept-Language": "en-US, en;q=0.5"})

In [4]:
# Read in the csv file with wikipedia links for each observation
wk = pd.read_csv("/content/drive/MyDrive/Priceline/JintongYu/wiki_links.csv", usecols=['city_name', 'wiki_link'])

In [5]:
wk.head()

Unnamed: 0,city_name,wiki_link
0,ROME,https://en.wikipedia.org/wiki/Rome
1,BARCELONA,https://en.wikipedia.org/wiki/Barcelona
2,ISTANBUL,https://en.wikipedia.org/wiki/Istanbul
3,MILAN,https://en.wikipedia.org/wiki/Milan
4,FLORENCE,https://en.wikipedia.org/wiki/Florence


In [6]:
wk_ls = wk["wiki_link"].tolist()

In [7]:
# Define a function to extract raw text data from a Wikipedia page
def scrape_wiki(link):
  try:
    # Send an HTTP GET request to the link and retrieve the HTML content
    response = requests.get(link, headers=user_agent)
    html_content = response.content
    
    # Parse the HTML content
    soup = BeautifulSoup(html_content, "html.parser")

    # Extract text data
    text = ''
    for paragraph in soup.find_all('p'):
      text += paragraph.text
    
    # Create a dictionary for storing text
    return{
        "paragraph": text
    }
  except:
    return{
        "paragraph": NAN
    }

In [8]:
# Trial 1
#link = "https://en.wikipedia.org/wiki/Rome"
#scrape_wiki(link)

In [9]:
# Loop through all Wikipedia links for extracting text data
results = []
for link in wk_ls:
  results.append(scrape_wiki(link))

In [10]:
# Convert the dictionary to a data frame
wk_df = pd.DataFrame.from_dict(results)

In [11]:
# For removing stopwords in text
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [12]:
# Define a function for text cleaning that can be applied for a column in a data frame
def clean_text(df, column):
    """
    Preprocesses a text column in a pandas DataFrame.
    """
    # Convert to lowercase
    df[column] = df[column].str.lower()
    
    # Remove punctuation
    df[column] = df[column].str.replace('[{}]'.format(string.punctuation), '')
    
    # Remove numbers
    df[column] = df[column].str.replace('\d+', '')
    
    # Remove selected words --- this step is supposed to be modified for different use cases
    df[column] = df[column].str.replace('[^\x01-\x7F]', '') # Remove non-English characters

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    df[column] = df[column].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))

    # Strip whitespace
    df[column] = df[column].str.strip()

    # Remove stop words
    nltk.download('stopwords')
    stop_words = set(nltk.corpus.stopwords.words('english'))
    df[column] = df[column].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
    
    return df

In [13]:
clean_text(wk_df, "paragraph")

  df[column] = df[column].str.replace('[{}]'.format(string.punctuation), '')
  df[column] = df[column].str.replace('\d+', '')
  df[column] = df[column].str.replace('[^\x01-\x7F]', '')
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,paragraph
0,caput mundilatinthe capital world rome italian...
1,barcelona brslon barslohn catalanbslon spanish...
2,istanbul stnbl istanbuul u also stnbl istanbuu...
3,milan mln milan u also mln milahn lombard mil ...
4,florence flrns florrnss italian firenze firnts...
...,...
175,seoul sol koreansul listen litcapital official...
176,busan korean pronunciationpusan officially kno...
177,tokyo tokio japanese tky toko listen officiall...
178,kyoto kjoto japanese kyto koto listen official...


In [16]:
# Save to a csv file
wk_df.to_csv("/content/drive/MyDrive/Priceline/JintongYu/wiki_text.csv")