In [13]:
import pandas as pd
from collections import Counter
import nltk
from nltk.corpus import stopwords
import numpy as np

In [20]:
df=pd.read_csv('edmunds.csv')
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rachana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/rachana/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
stop = stopwords.words('english')
def remove_stopwords(text):
    words = nltk.word_tokenize(text)
    stop_words = set(stopwords.words("english"))
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)
df['body']=df['body'].str.lower()

df['body'] = df['body'].astype(str).apply(remove_stopwords)
print(df['body'])

0       braking - sorry 70 0 braking 189 reported c & ...
1       new 2004 accord drove driveway last night . go...
2       love numbers , compare performance price numbe...
3       kd , people buy tl accord , reason bought 330 ...
4       pg48477 ... prove point . luxury primary crite...
                              ...                        
4995    `` meaningless '' guess 's meaningless actuall...
4996    guess everyone hung whole msrp value thing . g...
4997    please stop yelling ! consider used camaro z28...
4998    response exepected ... discounting areas bmw e...
4999    `` please mountain , curvy , wavy road nonsens...
Name: body, Length: 5000, dtype: object


In [22]:
# Map the brand names to the car names
mapping_df = pd.read_csv('car_models_and_brands.csv')
model_to_brand = dict(zip(mapping_df['Model'], mapping_df['Brand']))
def replace_model_with_brand(comment):
    for model, brand in model_to_brand.items():
        comment = comment.replace(model, brand)
    return comment

df['body'] = df['body'].str.lower().apply(replace_model_with_brand)

In [23]:
brands=mapping_df['Brand'].unique()
values_to_remove = ['car', 'seat', 'problem','"hyundai,"','hyundai.','"kia,"','kia.']

brands = [x for x in brands if x not in values_to_remove]

In [25]:
brand_freq = {}

# Iterate through the "body" column of the target DataFrame
for _, row in df.iterrows():
    body_text = row['body']
    # Check for NaN values and skip them
    if not isinstance(body_text, str) and np.isnan(body_text):
        continue
    
    # Split the "body" text into words
    words = body_text.split()

    words = list(set(words))

    # Count the frequency of brand names in the "body" text
    for word in words:
        if word in brands:
            brand_freq[word] = brand_freq.get(word, 0) + 1

# Now, brand_freq dictionary contains the frequency counts of brand names in the "body" column
print(brand_freq)

{'bmw': 1433, 'honda': 426, 'acura': 544, 'nissan': 293, 'subaru': 217, 'infiniti': 399, 'sedan': 672, 'hyundai': 109, 'chevrolet': 106, 'toyota': 321, 'pontiac': 52, 'mercedes-benz': 205, 'audi': 413, 'ford': 128, 'mazda': 69, 'lincoln': 55, 'cadillac': 107, 'chrysler': 62, 'dodge': 55, 'volkswagen': 169, 'volvo': 122, 'mitsubishi': 22, 'saturn': 11, 'kia': 23, 'suzuki': 17, 'buick': 29, 'mercury': 5}


In [27]:
sorted_dict = sorted(brand_freq.items(), key=lambda item: item[1],reverse=True)
top_10_brands=sorted_dict[:10]
for brand, frequency in top_10_brands:
    print(f'{brand}: {frequency}')
top_10_list = [item[0] for item in top_10_brands]

bmw: 1433
sedan: 672
acura: 544
honda: 426
audi: 413
infiniti: 399
toyota: 321
nissan: 293
subaru: 217
mercedes-benz: 205


In [31]:
# Initialize a dictionary to store brand mentions per post
brand_mentions_per_post = {brand: set() for brand in top_10_list}

# Initialize dictionaries to store brand co-mentions and individual brand mentions
co_mentions = {}
for brand1 in top_10_list:
    co_mentions[brand1] = {}
    for brand2 in top_10_list:
        co_mentions[brand1][brand2] = 0
individual_mentions = {brand: 0 for brand in top_10_list}

# Function to calculate lift ratio between two brands
def calculate_lift(brand1, brand2, total_posts):
    if brand1 == brand2:
        return 0  # Lift ratio between the same brand is 0
    # Calculate lift using the formula: lift(brand1, brand2) = (P(brand1 and brand2) / (P(brand1) * P(brand2))) * N
    p_brand1_and_brand2 = co_mentions[brand1][brand2]
    p_brand1 = individual_mentions[brand1]
    p_brand2 = individual_mentions[brand2]
    if p_brand1 == 0 or p_brand2 == 0:
        return 0  
    else:
        return (p_brand1_and_brand2 / (p_brand1 * p_brand2)) * total_posts

# Iterate through the "body" column of the target DataFrame
for _, row in df.iterrows():
    body_text = row['body']
    
    # Check for NaN values and skip them
    if not isinstance(body_text, str) and np.isnan(body_text):
        continue
    
    # Split the "body" text into words
    words = body_text.split()
    
    # Convert words to lowercase for case-insensitive matching
    words = [word.lower() for word in words]
    
    # Keep track of previously mentioned brands in the current post
    mentioned_brands_in_post = set()
    
    for i, word in enumerate(words):
        if word in top_10_list:
            # Check if the brand was already mentioned in the current post
            if word not in mentioned_brands_in_post:
                # Increment individual mention count for the brand
                individual_mentions[word] += 1
                # Update the set of mentioned brands in the current post
                mentioned_brands_in_post.add(word)
                
                # Check for co-mentions of other brands in the same post
                for j in range(i + 1, min(i + 6, len(words))):  # Change '6' to your desired separation limit
                    if words[j] in top_10_list:
                        # Increment co-mention count for the pair of brands (both directions)
                        co_mentions[word][words[j]] += 1
                        co_mentions[words[j]][word] += 1  # Symmetric update

# Calculate total number of posts
total_posts = len(df)

lift_ratios = {}
lift_already_calculated = set()  # To keep track of already calculated lift pairs

for brand1 in top_10_list:
    for brand2 in top_10_list:
        if brand1 != brand2:
            # Ensure we calculate lift only once for each pair (ignoring the order)
            pair = tuple(sorted([brand1, brand2]))
            if pair not in lift_already_calculated:
                lift_ratio = calculate_lift(brand1, brand2, total_posts)
                lift_ratios[pair] = lift_ratio
                lift_already_calculated.add(pair)

# Sort lift ratios in descending order
sorted_lift_ratios = sorted(lift_ratios.items(), key=lambda x: x[1], reverse=True)

# Print the top 10 unique lift ratios and associated brand pairs
for (brand1, brand2), lift_ratio in sorted_lift_ratios[:10]:
    print(f'Lift({brand1}, {brand2}) = {lift_ratio:.2f}')


Lift(honda, toyota) = 2.41
Lift(nissan, toyota) = 1.97
Lift(honda, nissan) = 1.84
Lift(acura, infiniti) = 1.45
Lift(audi, mercedes-benz) = 1.18
Lift(acura, audi) = 0.85
Lift(acura, honda) = 0.73
Lift(audi, infiniti) = 0.73
Lift(audi, bmw) = 0.65
Lift(bmw, infiniti) = 0.64
