# Let's start by using BeautifulSoup to parse a static website

### Example 1: IMDb Movies Rating

![Alt Text](imdb.png)

In [None]:
# Import necessary packages
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.ticker import FuncFormatter

In [None]:
# Send a GET request to the IMDb Top 250 page.
url = 'https://www.imdb.com/chart/top/'
# A browser-like User-Agent is sent because IMDb is known to answer the default
# python-requests User-Agent with "403 Forbidden".
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0 Safari/537.36'}
# The timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, headers=headers, timeout=30)
# Stop here on an HTTP error instead of silently parsing an error page.
response.raise_for_status()

# Parse the HTML response with BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')
type(soup)

In [None]:
# Collect every table row of the chart.
# NOTE(review): these selectors assume the table-based chart layout
# (tbody.lister-list / titleColumn / ratingColumn) - confirm against the live page.
movie_rows = soup.select('tbody.lister-list tr')

# Parallel lists, one entry per movie.
titles, ratings, years = [], [], []

for row in movie_rows:
    title_cell = row.find('td', class_='titleColumn')
    title = title_cell.find('a').text
    rating = row.find('td', class_='ratingColumn').strong.text
    # The year is rendered as "(1994)"; drop the surrounding parentheses.
    year = title_cell.span.text.strip('()')
    titles.append(title)
    ratings.append(rating)
    years.append(year)

# Assemble the columns into a DataFrame and store ratings as 32-bit floats.
data = dict(Title=titles, Rating=ratings, Year=years)
df = pd.DataFrame(data).astype({'Rating': np.float32})
# Show the resulting table
df

#### Since the data is relatively simple here, we only do a visualization exercise:

In [None]:
# Select the top 10 movies
top_10_df = df.head(10)

# Set custom style using Seaborn
sns.set_style("whitegrid")

# Define custom colors (one per bar)
colors = ['#004C9A', '#44AEEF', '#00798C', '#00B2A9', '#FF9F1C', '#FF6500', '#FF2E63', '#91328C', '#F47321', '#009CDA']

# Set up the plot
plt.figure(figsize=(10, 6))
ax = sns.barplot(x='Rating', y='Title', data=top_10_df, palette=colors)


# Customize the plot
ax.set_xlabel('Rating', fontsize=12, fontweight='bold', fontfamily='Arial')
ax.set_ylabel('Movie Title', fontsize=12, fontweight='bold', fontfamily='Arial')
ax.set_title('Top 10 Movies - IMDb Ratings', fontsize=14, fontweight='bold', fontfamily='Arial')


# Add data labels to the bars.
# The labels are white, so they must sit INSIDE the coloured bar: right-align
# them slightly before the bar end. Left-aligned at the bar end they would be
# drawn on the white background and be invisible.
label_inset = 0.1
for i, row in enumerate(top_10_df.itertuples()):
    ax.text(row.Rating - label_inset, i, f"{row.Rating:.1f}", ha='right', va='center',
            fontsize=10, fontweight='bold', color='white')

# Remove spines
sns.despine(left=True, bottom=True)

# Customize tick labels (one decimal place on the rating axis)
ax.xaxis.set_major_formatter(FuncFormatter(lambda x, _: f'{x:.1f}'))

# Adjust the spacing
plt.tight_layout()

# Display the plot
plt.show()

#### <span style="color:darkred">*Exercise*:
For the top 250 movies, group by year and visualize the time series of average scores.
</span>

In [None]:
## fill your code here

### Example 2: AI keywords in Wikipedia

![Alt Text](wiki.png)

In [None]:
import requests
from bs4 import BeautifulSoup
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import re

# Define the URL of the Wikipedia page
url = "https://en.wikipedia.org/wiki/Artificial_intelligence"

# Send a GET request to the website; the timeout avoids hanging indefinitely
response = requests.get(url, timeout=30)
# Stop on an HTTP error rather than building a word cloud from an error page
response.raise_for_status()

# Parse the website's content using BeautifulSoup
soup = BeautifulSoup(response.text, 'html.parser')

# Get the text content of the website
text = soup.get_text()

# Use regex to find only the words.
# Raw string: '\w' in a normal string literal is an invalid escape sequence
# that newer Python versions warn about.
words = re.findall(r'\w+', text)

# Convert the list of words into a single string
text = ' '.join(words)

# Create a set of stopwords
stopwords = set(STOPWORDS)

# You can add more stopwords to the set
stopwords.update(["the", "and", "is", "in", "to", "of", "AI", "an", "as", "that", "for"])

# Create a WordCloud object
wordcloud = WordCloud(stopwords=stopwords, 
                      max_words=100, 
                      background_color='white', 
                      contour_width=3, 
                      contour_color='steelblue',
                      colormap='viridis',
                      width=800,
                      height=400).generate(text)

# Plot the WordCloud image                        
plt.figure(figsize=(10,5), facecolor='white')
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()


---
# Use selenium for dynamic parsing

Before we start
1. Please make sure you have downloaded chromedriver for your corresponding chrome version and OS:

    [Download chromedriver here: https://chromedriver.chromium.org/downloads](https://chromedriver.chromium.org/downloads)

2. Please install selenium, you can use either conda or pip for installation:
```sh
pip install selenium
```
    or
```sh
conda install selenium
```


In [None]:
import re
import time
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

In [None]:
# Location of the ChromeDriver binary. Replace it with your own path, e.g.
#   macOS:   chrome_driver_path = "/Users/xxx/Documents/Codes/chromedriver"
#   generic: chrome_driver_path = "your-own-directory-of-chromedriver"
# The value below is a Windows-style example.
chrome_driver_path = "C:\\Projects\\chromedriver_win32\\chromedriver.exe"

# Options object that collects the settings passed to Chrome WebDriver.
chrome_options = Options()
# Uncomment to run headless, i.e. without opening a visible browser window
# (useful on servers or wherever no display is available or desired).
# chrome_options.add_argument("--headless")

# Disable image loading in the browser (2 = block images).
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)

### Our target of data analysis today:

As a car-rental platform, we would like to analyze the current pre-owned vehicle market to
* Determine the price range and distribution of the cars on the current second-hand market
* Use relevant information to determine the insurance policy and price the contracts

Craigslist is a privately-held American company operating a classified advertisements website with sections devoted to jobs, housing, for sale, items wanted, services, community service, gigs, résumés, and discussion forums.
Today we would like to parse this website to get the prices of pre-owned vehicles.


![Alt Text](cl.png)

In [None]:
# Clean Price format. E.g. $8,000 -> 8000
def clean_price(raw_price: str) -> int:
    """Convert a displayed price string into an integer.

    Every character that is not an ASCII digit (currency sign, thousands
    separator, whitespace, ...) is dropped and the remaining digits are read
    as one integer.

    Args:
        raw_price: Price text, e.g. "$8,000".

    Returns:
        The price as an int, or 0 when the text contains no digit at all
        (for example an empty string), instead of raising ValueError.
    """
    # Restrict to ASCII digits: str.isdigit() also accepts characters such as
    # superscripts that int() cannot parse.
    digits = ''.join(char for char in raw_price if '0' <= char <= '9')
    return int(digits) if digits else 0

# Parse date "30 minutes ago"-> today's date; 03/14 -> 2023/03/14
def parse_date(date_str: str) -> datetime.date:
    """Turn a listing's posting-date text into a calendar date.

    Two formats are recognised:
      * relative text containing "ago" (e.g. "30 minutes ago") -> today's date
      * "month/day" (e.g. "03/14") -> that day in the current year, or in the
        previous year when the current-year date would lie in the future
        (a "12/30" post read in January was posted last December).

    Args:
        date_str: Raw date text.

    Returns:
        A ``date`` object, or None when the text matches neither format or
        does not describe a valid calendar day.
    """
    today = datetime.now().date()
    if 'ago' in date_str:
        return today

    if '/' in date_str:
        try:
            month, day = map(int, date_str.split('/'))
        except ValueError:
            # Not exactly two integer parts, e.g. "a/b" or "3/14/2023".
            return None
        # A posting date cannot be in the future, so try this year first and
        # fall back to last year.
        for year in (today.year, today.year - 1):
            try:
                candidate = datetime(year, month, day).date()
            except ValueError:
                # Invalid day for that year (e.g. 02/29 in a non-leap year).
                continue
            if candidate <= today:
                return candidate
        return None

    return None

# Extract car model year from model description
def extract_model_year(model_str: str)->int:
    """Return the first plausible model year found in a listing title.

    Only standalone four-digit numbers starting with 19 or 20 are treated as
    years, so trim/model numbers such as "1500" or "2500" (e.g. "Ram 1500")
    are not mistaken for a year.

    Args:
        model_str: Listing title or model description.

    Returns:
        The year as an int, or 0 when no year is present.
    """
    pattern = r'\b(?:19|20)\d{2}\b'
    match = re.search(pattern, model_str)
    return int(match.group()) if match else 0
    
# Extract car info from a list of tags
def extract_tag_info(tags:list)->dict:
    """Pick the known attributes out of a list of "key: value" strings.

    Only the keys "condition", "odometer", "type" and "transmission" are kept;
    any other key, and any string without a colon, is ignored.

    Args:
        tags: Lines of text such as "odometer: 85,000".

    Returns:
        A dict with exactly those four keys. Missing text attributes are "",
        and a missing or unparseable odometer is -1. The odometer is always
        an int; separators such as "," in "85,000" are ignored.
    """
    tag_info = {"condition": "", "odometer": -1, "type": "", "transmission":""}
    for tag in tags:
        # Split on the FIRST colon only so a value containing ":" stays intact.
        key_tag, separator, tag_value = tag.partition(":")
        if not separator:
            continue
        key_tag = key_tag.strip()
        if key_tag in tag_info:
            tag_info[key_tag] = tag_value.strip()

    odometer = tag_info["odometer"]
    if isinstance(odometer, str):
        # Keep ASCII digits only; int("85,000") would raise ValueError.
        digits = ''.join(char for char in odometer if '0' <= char <= '9')
        odometer = int(digits) if digits else -1
    tag_info["odometer"] = odometer
    return tag_info

# Extract car manufacturer from listing title
def extract_manufacturer(title: str)-> str:
    """Return the first known car brand mentioned in a listing title.

    Brands are matched case-insensitively as whole words, so a brand name that
    merely appears inside another word ("ford" in "affordable", "ram" in
    "panoramic", "mini" in "minivan") does not count. When several brands are
    present, the one listed first in ``common_car_brands`` wins.

    Args:
        title: Listing title.

    Returns:
        The lower-case brand name, or "" when no known brand is found.
    """
    title = title.lower()
    common_car_brands = ['ford', 'chevrolet', 'toyota', 'honda', 'nissan', 'jeep', 'subaru', 
                         'hyundai', 'kia', 'gmc', 'ram', 'volkswagen', 'bmw', 'mercedes-benz', 'lexus', 
                         'audi', 'mazda', 'dodge', 'buick', 'cadillac', 'acura', 'infiniti', 'volvo', 'lincoln', 
                         'mitsubishi', 'porsche', 'mini', 'tesla', 'jaguar', 'land rover', 'alfa romeo', 'genesis']
    for brand in common_car_brands:
        # \b anchors the brand at word boundaries; re.escape protects the
        # hyphen/space in multi-part names.
        if re.search(rf'\b{re.escape(brand)}\b', title):
            return brand
    return ""


In [None]:
def scrapeCarListing(page: int):
    """Scrape one gallery page of the New York Craigslist cars+trucks search.

    Opens a Chrome session configured by the notebook-level ``chrome_options``
    and ``chrome_driver_path``, loads the search page and reads every
    ``div.gallery-card`` element.

    Args:
        page: Page number inserted into the search URL fragment.

    Returns:
        A DataFrame with the columns ``title``, ``price``, ``post_date`` and
        ``link`` (one row per card). ``price`` is 0 when a card shows no price.
        The columns are present even when no card is found.
    """
    columns = ["title", "price", "post_date", "link"]
    all_data = []
    # Selenium 4 takes the driver location through a Service object; the old
    # ``executable_path`` argument and ``find_element(s)_by_*`` helpers were removed.
    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=chrome_options)
    try:
        url = f"https://newyork.craigslist.org/search/cta#search=1~gallery~{page}~0"
        driver.get(url)
        # Give the page a moment to render the cards.
        time.sleep(1)
        cards = driver.find_elements(By.CSS_SELECTOR, "div.gallery-card")
        for card in cards:
            title_link = card.find_element(By.CSS_SELECTOR, "a.titlestring")
            title = title_link.text
            # find_elements returns an empty list (no exception) when the card has no price.
            raw_price = card.find_elements(By.CSS_SELECTOR, ".priceinfo")
            if raw_price:
                price = clean_price(raw_price[0].text)
            else:
                price = 0
            # The first "·"-separated piece of the meta text is passed to parse_date.
            meta_data = card.find_element(By.CSS_SELECTOR, "div.meta").text.split("·")
            post_date = parse_date(meta_data[0])
            link = title_link.get_attribute("href").strip()
            d = {"title": title, "price": price, "post_date": post_date, "link":link}
            all_data.append(d)
    finally:
        # quit() ends the whole browser session and driver process, even when
        # scraping raised an exception.
        driver.quit()
    return pd.DataFrame(all_data, columns=columns)

def scrapeCarDetails(link: str):
    """Open a single listing page and extract its attribute tags.

    Uses the notebook-level ``chrome_options`` and ``chrome_driver_path`` to
    start a Chrome session, reads the text of the ``div.mapAndAttrs`` block
    and hands its lines to ``extract_tag_info``.

    Args:
        link: URL of the listing.

    Returns:
        The dict produced by ``extract_tag_info``. When the attribute block is
        not found on the page, ``extract_tag_info`` receives an empty list.
    """
    # Selenium 4 API: Service object for the driver path, By locators for lookups.
    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=chrome_options)
    try:
        driver.get(link)
        # Give the page a moment to render.
        time.sleep(1)
        # find_elements (plural) returns [] instead of raising when the block
        # is absent, so one missing block does not abort the whole scrape.
        attr_blocks = driver.find_elements(
            By.CSS_SELECTOR, "body > section > section > section > div.mapAndAttrs")
        tags = attr_blocks[0].text.split("\n") if attr_blocks else []
    finally:
        # Always release the browser and driver process.
        driver.quit()
    car_details = extract_tag_info(tags)
    return car_details

In [None]:
# Scrape the first results page.
page = 0
df = scrapeCarListing(page)

# Visit every listing (one browser session per link) and turn the returned
# attribute dicts into their own columns.
expand_details = pd.DataFrame([scrapeCarDetails(link) for link in df["link"]])
df = pd.concat([df, expand_details], axis=1)

# Derive the model year and the manufacturer from the listing title.
df = df.assign(
    model_year=df["title"].apply(extract_model_year),
    manufacturer=df["title"].apply(extract_manufacturer),
)

In [None]:
# Display the scraped listings together with the derived columns.
df

In [None]:
# df.to_csv("car_example.csv")

### Data Summary and Cleaning

In [None]:
# Load the listings from the CSV file; the first CSV column becomes the index.
# This rebinds `df`, replacing the frame built by the scraping cells above.
df = pd.read_csv("car_example.csv", index_col=0)
df.head()

In [None]:
# Column dtypes and non-null counts - a quick check for missing values.
df.info()

In [None]:
# Summary statistics of the numeric columns.
df.describe()

#### Requirements:

Before we continue, we need to apply several pre-processing steps to the data:
- Fill empty condition with string 'unknown'
- Fill empty type with string 'unknown'
- Fill empty manufacturer with string 'unknown'

#### <span style="color:darkred">*Exercise*:
</span>

- Calculate the mean odometer and price for each car's manufacturer, and put them together with the full data
- Fill model year by average ages

In [None]:
# Drop Duplicates (rows that are identical in every column)
df = df.drop_duplicates()

# Fill NaN
# Assign the filled columns back instead of calling
# `df[col].fillna(..., inplace=True)`: that form operates on a temporary
# column object, triggers a FutureWarning in pandas 2.x and leaves `df`
# unchanged once Copy-on-Write is enabled.
fill_columns = ['condition', 'type', 'manufacturer']
df[fill_columns] = df[fill_columns].fillna('unknown')


In [None]:
## Fill your code here:


### Data Visualization

In [None]:
# Preview the first rows of the cleaned data before plotting.
df.head()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#### Visualize Price vs. Odometer by Scatter Plot

In [None]:
# Colour the points by the 'ages' column when it exists. No cell shown in this
# notebook creates it (presumably it comes from the exercise above - confirm),
# so fall back to 'model_year' instead of failing on a missing column.
hue_column = 'ages' if 'ages' in df.columns else 'model_year'

plt.figure(figsize=(10, 6))
# sns.scatterplot(data=df, x='odometer', y='price')
scatter = sns.scatterplot(data=df, x='odometer', y='price', hue=hue_column, edgecolor=None)
# Customize the plot
scatter.set_title('Price vs Odometer Reading by Manufacturing Year', fontsize=16, fontweight='bold')
scatter.set_xlabel('Odometer Reading', fontsize=12, fontweight='bold')
scatter.set_ylabel('Price', fontsize=12, fontweight='bold')

# Add a legend outside the plot at position (1.02, 0)
plt.legend(bbox_to_anchor=(1.02, 0), loc='lower left', borderaxespad=0)

# Remove top and right spines
sns.despine(top=True, right=True)

plt.tight_layout()
plt.show()


#### Filter of Outliers

Here we can see a few outliers. We can use the **IQR (Interquartile Range) method** to filter them out:

In [None]:
# First and third quartile of the price column, and the spread between them.
Q1, Q3 = df['price'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Non-outliers lie within 1.5 * IQR of the quartiles (bounds included).
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
non_outlier_range = df['price'].between(lower_bound, upper_bound)

# Keep only the rows inside that range.
_df = df.loc[non_outlier_range]

In [None]:
# Set a white grid style for the plot
sns.set_style("whitegrid")

# Set custom color palette
sns.set_palette("Paired")

# Colour the points by the 'ages' column when it exists. No cell shown in this
# notebook creates it (presumably it comes from an exercise cell - confirm),
# so fall back to 'model_year' instead of failing on a missing column.
hue_column = 'ages' if 'ages' in _df.columns else 'model_year'

plt.figure(figsize=(10, 6))
scatter = sns.scatterplot(data=_df, x='odometer', y='price', hue=hue_column, edgecolor=None)
# Customize the plot
scatter.set_title('Price vs Odometer Reading by Manufacturing Year', fontsize=16, fontweight='bold')
scatter.set_xlabel('Odometer Reading', fontsize=12, fontweight='bold')
scatter.set_ylabel('Price', fontsize=12, fontweight='bold')

# Add a legend outside the plot at position (1.02, 0)
plt.legend(bbox_to_anchor=(1.02, 0), loc='lower left', borderaxespad=0)

# Remove top and right spines
sns.despine(top=True, right=True)

plt.tight_layout()
plt.show()

#### <span style="color:darkred">*Exercise*:
Visualize Average Price vs. Average Odometer by Scatter Plot
</span>

In [None]:
## fill your code here

#### Visualize car type vs. price by barchart:

In [None]:
# Mean price per car type, most expensive first, as a two-column frame.
average_price_by_type = (
    df.groupby('type')['price']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)

# Style and colour palette
sns.set_style("whitegrid")
sns.set_palette("Blues")

# A larger canvas so the rotated labels fit
plt.figure(figsize=(14, 8))

# One bar per car type
barplot = sns.barplot(data=average_price_by_type, x='type', y='price', edgecolor=None)

# Title and axis labels in larger, bold text
barplot.set_title('Average Price by Car Type', fontsize=18, fontweight='bold')
barplot.set_xlabel('Car Type', fontsize=14, fontweight='bold')
barplot.set_ylabel('Average Price ($)', fontsize=14, fontweight='bold')

# Rotate the x tick labels so they stay readable
plt.xticks(rotation=45, ha='right', fontsize=12)

# Drop the top and right spines
sns.despine(top=True, right=True)

# Write the rounded mean price above each bar; bar positions are 0..n-1,
# matching the reset index of the frame.
for position, mean_price in enumerate(average_price_by_type['price']):
    barplot.text(position, mean_price, f"$ {round(mean_price)}",
                 ha='center', va='bottom', color='black', fontsize=12)

plt.tight_layout()
plt.show()


### <span style="color:darkred">*Exercise*:
Visualize manufacturer vs. price by box plot:
</span>

In [None]:
## fill your code here

### <span style="color:darkred">*Exercise*:
Visualize the correlation matrix of ['price', 'odometer', 'model_year']:
</span>

In [None]:
## fill your code here

## Besides basic statistical analysis and visualization, what else can we do?

A typical machine-learning project has the following steps:

##### 1. Get data ready
##### 2. Preprocessing the data
##### 3. Visualization to understand the data
##### <span style="color:darkred"> 4. Feature engineering (if non-numeric values exist)
</span>

##### <span style="color:darkred"> 5. Train, tune and evaluate the model
</span>

### Example 1. Use Linear Regression to Predict Car Price

[Reference: Linear regression in Scikit-Learn](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Feature groups: numeric columns are used as-is, categorical ones are one-hot encoded.
numeric_features = ['odometer', 'model_year']
categorical_features = ['condition', 'type', 'transmission', 'manufacturer']

# Column-wise preprocessing; categories unseen during fitting are ignored at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Keep only listings with a positive price, then separate features and target.
df = df[df['price'] > 0]
X = df[[*numeric_features, *categorical_features]]
y = df['price']

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Fit the preprocessing on the training part only and reuse it for the test part.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Ordinary least-squares model trained on the processed training data.
reg = LinearRegression()
reg.fit(X_train_processed, y_train)

# Predictions for the held-out test set.
y_pred = reg.predict(X_test_processed)

# Evaluation metrics on the test set.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

for label, value in (
    ("Mean squared error:", mse),
    ("Root mean squared error:", rmse),
    ("R2 score:", r2),
):
    print(label, value)

### Example 2. Use XGBoost to Predict Car Price

[Reference: Xgboost](https://xgboost.readthedocs.io/en/stable/)

In [None]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import xgboost as xgb

# Feature groups: numeric columns are used as-is, categorical ones are one-hot encoded.
numeric_features = ['odometer', 'model_year']
categorical_features = ['condition', 'type', 'transmission', 'manufacturer']

# Column-wise preprocessing; categories unseen during fitting are ignored at transform time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)

# Keep only listings with a positive price, then separate features and target.
df = df[df['price'] > 0]
X = df[[*numeric_features, *categorical_features]]
y = df['price']

# 80/20 train/test split with a fixed seed.
# (train_test_split is imported in the linear-regression cell above.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

# Fit the preprocessing on the training part only and reuse it for the test part.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Gradient-boosted regressor with squared-error objective, trained on the processed data.
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    max_depth=3,
    learning_rate=0.01,
    random_state=10,
)
xgb_model.fit(X_train_processed, y_train)

# Predictions for the held-out test set.
y_pred = xgb_model.predict(X_test_processed)

# Evaluation metrics on the test set.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

for label, value in (
    ("Mean squared error:", mse),
    ("Root mean squared error:", rmse),
    ("R2 score:", r2),
):
    print(label, value)