# Recognizing Brand & Models

In [1]:
import pandas as pd

In [2]:
models = pd.read_csv('models.csv', header = None, names = ['Brand', 'Model'])

**Delete Duplicates**

In [3]:
models['Brand'] = models['Brand'].str.lower()
models['Model'] = models['Model'].str.lower()
models['Combined'] = (models['Brand'] + ' ' + models['Model'])
models['Combined'] = models['Combined'].str.replace('[\s\-,._]', '', regex = True)

unique_model = models.drop_duplicates(subset = 'Combined')
unique_model = unique_model.drop(columns = ['Combined'])

**Check Brand list**

In [4]:
brand = unique_model['Brand'].unique().tolist()
print(brand)

['acura', 'audi', 'bmw', 'buick', 'cadillac', 'car', 'chevrolet', 'chrysler', 'dodge', 'ford', 'honda', 'hyndai kia', 'hyundai', 'hyundai,', 'infiniti', 'kia', 'lincoln', 'mazda', 'mercedes', 'mercury', 'mitsubishi', 'nissan', 'nissan.', 'pontiac', 'problem', 'saturn', 'seat', 'sedan', 'subaru', 'suzuki', 'toyata', 'toyota', 'volkswagen', 'volkwagen', 'volvo']


**Notice weird brands:**

- _hyundai kia_, _car_, _problem_, _seat_ should be deleted
- _hyndai kia_ should be _hyundai_
- _hyundai,_ and _nissan._ should not have ',' and '.'.
- _toyata_ should be _toyota_
- _volkwagen_ should be _volkswagen_

In [5]:
new_model = unique_model[~unique_model['Brand'].isin(['hyundai kia', 'car', 'problem', 'seat'])]

new_model.loc[new_model['Brand'].str.contains('hyundai') & ~ new_model['Brand'].eq('hyundai'), 'Brand'] = 'hyundai'
new_model.loc[new_model['Brand'].str.contains('kia') & ~ new_model['Brand'].eq('kia'), 'Brand'] = 'kia'
new_model.loc[new_model['Brand'].str.contains('nissan') & ~ new_model['Brand'].eq('nissan'), 'Brand'] = 'nissan'
new_model.loc[new_model['Brand'].eq('toyata'), 'Brand'] = 'toyota'
new_model.loc[new_model['Brand'].eq('volkwagen'), 'Brand'] = 'volkswagen'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_model.loc[new_model['Brand'].str.contains('hyundai') & ~ new_model['Brand'].eq('hyundai'), 'Brand'] = 'hyundai'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_model.loc[new_model['Brand'].str.contains('kia') & ~ new_model['Brand'].eq('kia'), 'Brand'] = 'kia'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_model.loc[new_model['Brand'].str.contains('nissan') & ~ new_model['Brand'].eq('nissan'), 'Brand'] = 'nissan'
A value is trying to be set on a copy of a 

**Delete Dup Again**

In [6]:
new_model['Combined'] = (new_model['Brand'] + ' ' + new_model['Model'])
new_model['Combined'] = new_model['Combined'].str.replace('[\s\-,._]', '', regex = True)

new_model = new_model.drop_duplicates(subset = 'Combined')
new_model = new_model.drop(columns = ['Combined'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_model['Combined'] = (new_model['Brand'] + ' ' + new_model['Model'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_model['Combined'] = new_model['Combined'].str.replace('[\s\-,._]', '', regex = True)


**Check brand**

In [7]:
brand = new_model['Brand'].unique().tolist()
print(brand)

['acura', 'audi', 'bmw', 'buick', 'cadillac', 'chevrolet', 'chrysler', 'dodge', 'ford', 'honda', 'kia', 'hyundai', 'infiniti', 'lincoln', 'mazda', 'mercedes', 'mercury', 'mitsubishi', 'nissan', 'pontiac', 'saturn', 'sedan', 'subaru', 'suzuki', 'toyota', 'volkswagen', 'volvo']


**Check model**

In [8]:
print(len(new_model))
print(new_model['Model'].nunique())
duplicated_model = new_model[new_model.duplicated(subset = 'Model', keep = False)]
print("Rows with duplicate values in 'Model' column:")
print(duplicated_model)

481
478
Rows with duplicate values in 'Model' column:
       Brand    Model
1      acura   legend
44     buick  century
171    honda   legend
224  hyundai   matrix
420   toyota  century
444   toyota   matrix


**Modify model dup & create dictionary for models**

In [9]:
modify_rows = [1,44,171, 224, 420, 444]
new_model.loc[modify_rows, 'Model'] = new_model.loc[modify_rows, 'Brand'] + ' ' + new_model.loc[modify_rows, 'Model']
print(new_model['Model'].nunique())
model = new_model.set_index('Model')['Brand'].to_dict()

481


# Scrape Comments & Create CSV

**Note:** We still need to find the URL first

In [None]:
from bs4 import BeautifulSoup
import requests
import csv

entries = []
entry = []
urlnumber = 1

while urlnumber < 101:
    url = f'https://forums.edmunds.com/discussion/2864/general/x/entry-level-luxury-performance-sedans/p{urlnumber}'
    try:
        r = requests.get(url, timeout = 10) # Sending a request to access the page
    except Exception as e:
        print("Error message:",e)
        break;

    data = r.text
    
    soup = BeautifulSoup(data, 'lxml') # Getting the page source into the soup
    
    for div in soup.find_all('div'):
        entry = []
        if(div.get('class') != None and div.get('class')[0] == 'Comment'): # A single post is referred to as a comment. Each comment is a block denoted in a div tag which has a class called comment.
            ps = div.find_all('p') # gets all the tags called p to a variable ps
            aas = div.find_all('a') # gets all the tags called a to a variable aas
            spans = div.find_all('span')
            times = div.find_all('time') # used to extract the time tag which gives the iDate of the post

            concat_str = ''
            for str in aas[1].contents: # prints the contents that is between the tag start and end
                if str != "<br>" or str != "<br/>": # breaks in post which we need to work around
                    concat_str = (concat_str + ' '+ str).encode("utf-8").strip() # the format extracted is a unicode - we need a uniform structure to work with the strings
            entry.append(concat_str)

            concat_str = ''
            for str in times[0].contents:
                if str != "<br>" or str != "<br/>":
                    concat_str = (concat_str + ' '+ str).encode('iso-8859-1').strip()
            entry.append(concat_str)

            for div in div.find_all('div'):
                if (div.get('class') != None and div.get('class')[0] == 'Message'): # extracting the div tag with the class attribute as message
                    blockquotes = []
                    x = div.get_text()
                    for bl in div.find_all('blockquote'):
                        blockquotes.append(bl.get_text()) # block quote is used to get the quote made by a person. get_text helps to eliminate the hyperlinks and pulls out only the data.
                        bl.decompose()
                    # Encoding the text to ascii code by replacing the non-ascii characters
                    ascii_encoding = div.get_text().replace("\n"," ").replace("<br/>","").encode('ascii','replace')
                    # Convert the ASCII encoding to Latin1 encoding
                    latin1_encoding = ascii_encoding.decode('ascii').encode('iso-8859-1')
                    # Append the encoding bytes to output list
                    entry.append(latin1_encoding)

                    for bl in blockquotes:
                        ascii_encoding = bl.replace("\n"," ").replace("<br/>","").encode('ascii','replace')
                        latin1_encoding = ascii_encoding.decode('ascii').encode('iso-8859-1')
                        entry.append(latin1_encoding)

            entries.append(entry)
            
    urlnumber += 1

columns = ['User Name', 'Comment Date', 'Full Comment']

# Convert a list of byte to list a of string     
stringlist=[[x.decode('iso-8859-1') for x in entry] for entry in entries]
# Save the list to a csv file
with open('5KComments.csv', 'w') as output:
    writer = csv.writer(output, quoting=csv.QUOTE_ALL)
    writer.writerow(columns)
    writer.writerows(stringlist)

print ("Wrote to 5KComments.csv")

# To Do 1: Tokenization

# To Do 2: Find word similarity

**Question:** Should we focus only on those comments when they spelled the brand/model 100% correct? If so, we can ignore this step. If not, we need to determine whether they are having typos.

### Approach 1 - Simpler, but may have lower accuracy

In [10]:
import spacy
import spacy.cli
import en_core_web_sm

def jaccard_similarity(word1, word2):
    set1 = set(word1)
    set2 = set(word2)

    intersection = len(set1.intersection(set2))
    union = len(set1.union(set2))

    similarity = intersection / union
    return similarity

word1 = "Honda"
word2 = "honda"

similarity_percentage = jaccard_similarity(word1, word2) * 100
print(f"Jaccard Similarity: {similarity_percentage:.2f}%")


Jaccard Similarity: 66.67%


### Approach 2 - Have higher accuracy, but hard to determine the similarity threshold

In [11]:
import spacy
import spacy.cli
import en_core_web_sm

nlp = en_core_web_sm.load()

def word_similarity(word1, word2):
    doc1 = nlp(word1)
    doc2 = nlp(word2)

    similarity = doc1.similarity(doc2)
    return similarity

word1 = "Honda"
word2 = "honda"

similarity_percentage = word_similarity(word1, word2) * 100
print(f"Word Similarity: {similarity_percentage:.2f}%")

Word Similarity: 70.27%


  similarity = doc1.similarity(doc2)
