## Popularity Calculation

In [1]:
import nltk
from nltk import ngrams, FreqDist
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, webtext
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import groupby
from collections import Counter

In [2]:
stop = stopwords.words('english')
stop.extend(['thing','anything','much','something','My','We','my','we','lot','day','get','dealer','way'])
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/subhayuchakravarty/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
df = pd.read_csv('posts.csv')
df.shape

(5164, 4)

In [4]:
df['comments'] = df['comments'].astype(str)
mask = [isinstance(item, (str)) for item in df['comments']]
df = df.loc[mask]
df['allcomments'] = df['comments']

In [5]:
# Car models to be replaced with Brand name
audis=['A3','A4','S4','S3']
acuras=['TLX','ILX','TL']

# Synonymous words for replacement
performwords=['powerful','power','mileage','speed','transmission']
stylewords=['look','looks','stylish','gorgeous','classy']
luxurywords=['comfort','convenience','high-end','luxurious','expensive','fancy','grand']
costwords=['costs','cheap','prices','pricing','economic']
bigwords=['big','spacious','space','room','huge','leg-space','bigger']

In [6]:
# Replacements
for audimodel in audis:
    df['comments'] = df['comments'].str.replace(audimodel, 'Audi')
for acuramodel in acuras:
    df['comments'] = df['comments'].str.replace(acuramodel, "Acura")
df['comments'] = df['comments'].str.replace('MB', "Mercedes")
df['comments'] = df['comments'].str.replace('VW', "Volkswagen")


for luxury in luxurywords:
    df['comments'] = df['comments'].str.replace(luxury, 'luxury')
for performance in performwords:
    df['comments'] = df['comments'].str.replace(performance, "performance")
for style in stylewords:
    df['comments'] = df['comments'].str.replace(style, "style")
for cost in costwords:
    df['comments'] = df['comments'].str.replace(cost, "price")
for big in bigwords:
    df['comments'] = df['comments'].str.replace(big, "space")

In [7]:
def removeStop(wordsList):
    return [i for i in wordsList if i not in stop]

In [8]:
tokenizer=RegexpTokenizer(r'\w+')
tagList=[]
for com in df['comments']:
    wordsList = tokenizer.tokenize(com)
    wordsList = removeStop(wordsList)
    tagged = nltk.pos_tag(wordsList) 
    tagList.append(tagged)
print(tagList[0])

[('still', 'RB'), ('Kia', 'NNP'), ('4', 'CD'), ('cars', 'NNS'), ('though', 'IN'), ('An', 'DT'), ('Audi', 'NNP'), ('Need', 'NNP'), ('schedule', 'NN'), ('return', 'NN'), ('miles', 'NNS'), ('promised', 'VBD'), ('drive', 'NN'), ('least', 'NN'), ('twice', 'JJ'), ('week', 'NN'), ('work', 'NN'), ('tomorrow', 'NN'), ('thurs', 'VBZ'), ('Probably', 'RB'), ('return', 'JJ'), ('weekend', 'NN'), ('Due', 'NNP'), ('date', 'NN'), ('Sept', 'VBD'), ('3rd', 'CD')]


In [9]:
wordSoup = [y for x in tagList for y in x]

In [10]:
nounList = [word for word in wordSoup if word[1]=='NNP']
adjectiveList = [word for word in wordSoup if (word[1]=='NN')]

nouncountList = Counter(nounList)
nouncountList.most_common(20)

[(('Audi', 'NNP'), 2605),
 (('Acura', 'NNP'), 1721),
 (('BMW', 'NNP'), 1414),
 (('AWD', 'NNP'), 444),
 (('Mercedes', 'NNP'), 414),
 (('Cadillac', 'NNP'), 366),
 (('ELLPS', 'NNP'), 285),
 (('Lexus', 'NNP'), 262),
 (('Sport', 'NNP'), 243),
 (('S', 'NNP'), 242),
 (('CTS', 'NNP'), 236),
 (('Accord', 'NNP'), 235),
 (('Infiniti', 'NNP'), 235),
 (('Honda', 'NNP'), 223),
 (('Volkswagen', 'NNP'), 221),
 (('A', 'NNP'), 216),
 (('So', 'NNP'), 193),
 (('C', 'NNP'), 184),
 (('MSRP', 'NNP'), 180),
 (('ATS', 'NNP'), 175)]

In [11]:
adjcountList = Counter(adjectiveList)
adjcountList.most_common(25)

[(('car', 'NN'), 2869),
 (('performance', 'NN'), 1425),
 (('price', 'NN'), 1042),
 (('time', 'NN'), 863),
 (('luxury', 'NN'), 646),
 (('space', 'NN'), 510),
 (('drive', 'NN'), 478),
 (('year', 'NN'), 471),
 (('engine', 'NN'), 455),
 (('series', 'NN'), 449),
 (('style', 'NN'), 419),
 (('money', 'NN'), 392),
 (('model', 'NN'), 374),
 (('course', 'NN'), 334),
 (('wife', 'NN'), 325),
 (('market', 'NN'), 320),
 (('point', 'NN'), 318),
 (('brand', 'NN'), 314),
 (('class', 'NN'), 301),
 (('cost', 'NN'), 295),
 (('sport', 'NN'), 267),
 (('lease', 'NN'), 265),
 (('today', 'NN'), 256),
 (('service', 'NN'), 255),
 (('vehicle', 'NN'), 242)]

In [21]:
brandList = ['Acura','Audi','BMW','Cadillac','Infiniti']
attributeList = ['luxury','performance','style','price','space']

In [22]:
comb_count=[]
car_count={}
attrib_count={}
for car in brandList:
    num_car=0
    for comment in df.allcomments:
            if car in comment:
                num_car+=1
    car_count[car]=num_car
    for attrib in attributeList:
        num_combined=0
        num_attrib=0
        for comment in df.allcomments:
            if car in comment and attrib in comment:
                num_combined+=1
            if attrib in comment:
                num_attrib+=1
        attrib_count[attrib]=num_attrib
        comb_count.append(((car,attrib),num_combined))

In [23]:
def calculate_lift(car, attrib, combined):
    lift = (5164*combined)/(car_count[car]*attrib_count[attrib])
    return lift

In [24]:
lift_scores={}
for k,num in comb_count:
    lift_scores[k[0]+','+k[1]] = calculate_lift(k[0],k[1], num)
lift_scores

{'Acura,luxury': 2.9135408560311284,
 'Acura,performance': 1.9177898550724637,
 'Acura,style': 2.993623188405797,
 'Acura,price': 2.0477931034482757,
 'Acura,space': 1.8074,
 'Audi,luxury': 2.2597424100859818,
 'Audi,performance': 2.587573234659266,
 'Audi,style': 2.04731069115898,
 'Audi,price': 2.0161303846556966,
 'Audi,space': 2.6683282674772038,
 'BMW,luxury': 1.7614331194097732,
 'BMW,performance': 1.8629148629148629,
 'BMW,style': 1.2959407742016438,
 'BMW,price': 1.8211598746081505,
 'BMW,space': 1.4530735930735932,
 'Cadillac,luxury': 3.6279723303069606,
 'Cadillac,performance': 2.4253891572732154,
 'Cadillac,style': 2.4253891572732154,
 'Cadillac,price': 2.638058748403576,
 'Cadillac,space': 3.3470370370370373,
 'Infiniti,luxury': 2.806338717040193,
 'Infiniti,performance': 2.8222006315278114,
 'Infiniti,style': 2.9267265808436562,
 'Infiniti,price': 1.8901175110768638,
 'Infiniti,space': 4.038882681564246}

In [25]:
sorted_lifts = sorted(lift_scores.items(), key=lambda kv: kv[1], reverse=True)
sorted_lifts

[('Infiniti,space', 4.038882681564246),
 ('Cadillac,luxury', 3.6279723303069606),
 ('Cadillac,space', 3.3470370370370373),
 ('Acura,style', 2.993623188405797),
 ('Infiniti,style', 2.9267265808436562),
 ('Acura,luxury', 2.9135408560311284),
 ('Infiniti,performance', 2.8222006315278114),
 ('Infiniti,luxury', 2.806338717040193),
 ('Audi,space', 2.6683282674772038),
 ('Cadillac,price', 2.638058748403576),
 ('Audi,performance', 2.587573234659266),
 ('Cadillac,performance', 2.4253891572732154),
 ('Cadillac,style', 2.4253891572732154),
 ('Audi,luxury', 2.2597424100859818),
 ('Acura,price', 2.0477931034482757),
 ('Audi,style', 2.04731069115898),
 ('Audi,price', 2.0161303846556966),
 ('Acura,performance', 1.9177898550724637),
 ('Infiniti,price', 1.8901175110768638),
 ('BMW,performance', 1.8629148629148629),
 ('BMW,price', 1.8211598746081505),
 ('Acura,space', 1.8074),
 ('BMW,luxury', 1.7614331194097732),
 ('BMW,space', 1.4530735930735932),
 ('BMW,style', 1.2959407742016438)]