In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import string
import re
import gc
from tqdm import tqdm  # For progress monitoring

In [2]:
# Set the display option to show the entire column value
pd.set_option('display.max_colwidth', None)

In [3]:
train_df = pd.read_csv('train.tsv', sep='\t')

In [4]:
train_df.head() 

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
0,0,MLB Cincinnati Reds T Shirt Size XL,3,Men/Tops/T-shirts,,10.0,1,No description yet
1,1,Razer BlackWidow Chroma Keyboard,3,Electronics/Computers & Tablets/Components & Parts,Razer,52.0,0,This keyboard is in great condition and works like it came out of the box. All of the ports are tested and work perfectly. The lights are customizable via the Razer Synapse app on your PC.
2,2,AVA-VIV Blouse,1,Women/Tops & Blouses/Blouse,Target,10.0,1,"Adorable top with a hint of lace and a key hole in the back! The pale pink is a 1X, and I also have a 3X available in white!"
3,3,Leather Horse Statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,New with tags. Leather horses. Retail for [rm] each. Stand about a foot high. They are being sold as a pair. Any questions please ask. Free shipping. Just got out of storage
4,4,24K GOLD plated rose,1,Women/Jewelry/Necklaces,,44.0,0,Complete with certificate of authenticity


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1482535 entries, 0 to 1482534
Data columns (total 8 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   train_id           1482535 non-null  int64  
 1   name               1482535 non-null  object 
 2   item_condition_id  1482535 non-null  int64  
 3   category_name      1476208 non-null  object 
 4   brand_name         849853 non-null   object 
 5   price              1482535 non-null  float64
 6   shipping           1482535 non-null  int64  
 7   item_description   1482531 non-null  object 
dtypes: float64(1), int64(3), object(4)
memory usage: 90.5+ MB


In [6]:
train_df.describe()

Unnamed: 0,train_id,item_condition_id,price,shipping
count,1482535.0,1482535.0,1482535.0,1482535.0
mean,741267.0,1.90738,26.73752,0.4472744
std,427971.1,0.9031586,38.58607,0.4972124
min,0.0,1.0,0.0,0.0
25%,370633.5,1.0,10.0,0.0
50%,741267.0,2.0,17.0,0.0
75%,1111900.0,3.0,29.0,1.0
max,1482534.0,5.0,2009.0,1.0


In [7]:
train_df.duplicated().sum()

0

In [8]:
train_df.isnull().sum()

train_id                  0
name                      0
item_condition_id         0
category_name          6327
brand_name           632682
price                     0
shipping                  0
item_description          4
dtype: int64

# 전처리

## 이상치 처리

In [9]:
# 이상치 처리 
train_df = train_df[train_df['price'] != 0]

# 이상치 처리 확인
train_df.sort_values(by='price', ascending=False)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description
760469,760469,NEW Chanel WOC Caviar Gold Hardware,1,Women/Women's Handbags/Shoulder Bag,Chanel,2009.0,0,"New with tag, box and dustbag but no receipt. Dark purple color Cross body shoulder bag Gold hardware You must purchase and pay for two listing for the total [rm] or your purchase will be cancelled. Smoke free and Pet free home. PRICE IS FIRM thank you"
1262245,1262245,NEW-Chanel Boy Wallet o Chain WOC Caviar,1,Women/Women's Handbags/Messenger & Crossbody,Chanel,2006.0,0,Brand: Chanel Style: WOC Boy Wallet on Chain Type: Crossbody shoulder bag Color: Black with ruthenium hardware Material: Caviar leather Condition: New with original packaging& copy of receipt Total price is [rm]. Must purchase and pay for both listings to be valid.
1393600,1393600,David Yurman Wheaton ring,2,Women/Jewelry/Rings,David Yurman,2004.0,0,David Yurman Wheaton wing. Size 6. Original receipt included (may have to black out some information) as well as David Yurman pouch and polishing cloth. Like new condition. Worn twice.
415027,415027,Chanel Classic Jumbo Single flap bag,3,Women/Women's Handbags/Shoulder Bag,Chanel,2000.0,1,"Authentic. Pre-loved in Excellent Condition. Pen mark on the bottom side of the bag but won't be noticed when worn, shown in the 2nd picture. Must be bought with the other listing, [rm]. Total of [rm]."
778940,778940,Mary kay,1,Beauty/Makeup/Face,Mary Kay,2000.0,1,30 time wise sets Oily to combo skin
...,...,...,...,...,...,...,...,...
1320473,1320473,☆ TATTOO CHOKER ☆,1,Women/Jewelry/Necklaces,,3.0,1,"LAST ONE LEFT !!!!! CUTE AND TRENDY PLASTIC TATTOO CHOKER ! ONLY [rm], VERY FEW IN STOCK! FREE SHIP FREE SHIPPING Shipping Details: ☆Tracking number is included and usually updates within 24hrs. SOMETIMES IT DOES NOT SCAN UNTIL IT REACHES YOUR CITY, PLEASE BE PATIENT. ☆Please allow 2-7 Business days. MAY TAKE LONGER DUE TO USPS. ☆I can bundle, comment on any of my posts. :) Happy Shopping!!! COMPARE TO: FOREVER 21, BRANDY MELVILLE, H&M, CLAIRES, ZARA, HOT TOPIC, CHARLOTTE RUSSE ♡SANDY LOVE SHOP♡"
1421106,1421106,3D ankle socks,1,Women/Athletic Apparel/Socks,,3.0,1,"Dream catcher Listing for active users, super cute , for adult and teenagers"
1320462,1320462,Blackheads removers free shipping!!,1,Beauty/Skin Care/Face,,3.0,1,2 blackheads removers
1468628,1468628,Hollywood Spaniel book,1,Other/Books/Children's Books,,3.0,1,Hollywood Spaniel softcover new. FREE SHIPPING


## 결측치 처리

In [10]:
train_df.isnull().sum()

train_id                  0
name                      0
item_condition_id         0
category_name          6314
brand_name           632336
price                     0
shipping                  0
item_description          4
dtype: int64

### 'category_name' 결측치 삭제

In [11]:
train_df = train_df.dropna(subset=['category_name'])

### 'item_description' 유사 결측치 삭제

In [12]:
# 유사 결측치 확인 및 삭제

# Identify rows where 'item_description' contains 'no description'
no_description_mask = train_df['item_description'].str.contains('no description', case=False, na=False)

# Replace 'item_description' in those rows with NaN
train_df.loc[no_description_mask, 'item_description'] = np.nan

# Drop rows where 'item_description' is NaN
train_df = train_df.dropna(subset=['item_description'])

In [13]:
train_df.isnull().sum()

train_id                  0
name                      0
item_condition_id         0
category_name             0
brand_name           591841
price                     0
shipping                  0
item_description          0
dtype: int64

In [14]:
train_df.shape

(1393289, 8)

### 'category_name' 자르기

In [15]:
# Function to split the category_name column and ensure it always returns 3 elements
def split(category_name):
    try:
        parts = category_name.split('/')
        # Ensure the returned list is always of length 3
        return (parts + ['Null', 'Null', 'Null'])[:3]
    except AttributeError:
        return ['Null', 'Null', 'Null']

category_1 = []
category_2 = []
category_3 = []

for i in range(train_df.shape[0]):
    temp = split(train_df['category_name'].iloc[i])  # train_df['category_name'][i] 대신 train_df['category_name'].iloc[i] 사용
    category_1.append(temp[0])
    category_2.append(temp[1])
    category_3.append(temp[2])

train_df['category_1'] = category_1
train_df['category_2'] = category_2
train_df['category_3'] = category_3

print('1st Category:\n', train_df['category_1'].value_counts())
print('2nd Category:', train_df['category_2'].nunique())
print('3rd Category:', train_df['category_3'].nunique())

1st Category:
 Women                     626532
Beauty                    199877
Kids                      160906
Electronics               115110
Men                        87837
Home                       62225
Vintage & Collectibles     44108
Other                      42801
Handmade                   29769
Sports & Outdoors          24124
Name: category_1, dtype: int64
2nd Category: 113
3rd Category: 869


### 텍스트 컬럼 전처리

In [16]:
# # Text column preprocessing

# text_cols = ['name', 'brand_name', 'item_description', 'category_1', 'category_2', 'category_3']

# def preprocess_text(text):
#     if isinstance(text, str):
#         # Convert to lowercase
#         text = text.lower()
#         # Remove punctuation
#         text = text.translate(str.maketrans('', '', string.punctuation))
#         # Remove non-alphanumeric characters (except spaces and accented characters)
#         text = re.sub(r'[^\w\s\u00C0-\u017F]', '', text)
#         # Remove extra whitespace
#         text = ' '.join(text.split())
#     return text

# # Initialize tqdm
# tqdm.pandas()

# # Apply preprocessing with progress bar
# for col in text_cols:
#     print(f"Processing column: {col}")
#     train_df[col] = train_df[col].progress_apply(preprocess_text)
#     print(f"Finished processing {col}")

# print("All text preprocessing completed!")

In [17]:
# Text column preprocessing

text_cols = ['name', 'brand_name', 'item_description', 'category_1', 'category_2', 'category_3']

def preprocess_text(text):
    if isinstance(text, str):
        # Convert to lowercase
        text = text.lower()
        # Remove punctuation
        text = text.translate(str.maketrans('', '', string.punctuation))
        # Remove non-alphanumeric characters (except spaces and accented characters)
        text = re.sub(r'[^\w\s\u00C0-\u017F]', '', text)
        # Remove extra whitespace
        text = ' '.join(text.split())
    return text

chunk_size = 5000  # Adjust based on your memory capacity
num_chunks = (len(train_df) // chunk_size) + 1

# Initialize tqdm
tqdm.pandas()

# Process the data in chunks
processed_chunks = []
for i in tqdm(range(num_chunks)):
    start = i * chunk_size
    end = min((i + 1) * chunk_size, len(train_df))
    chunk = train_df.iloc[start:end].copy()

    # Apply preprocessing
    for col in text_cols:
        chunk[col] = chunk[col].progress_apply(preprocess_text)
    
    processed_chunks.append(chunk)
    
    # Explicitly delete chunk and call garbage collection to free memory
    del chunk
    gc.collect()

# Concatenate all chunks back into a single DataFrame
train_df = pd.concat(processed_chunks).reset_index(drop=True)

print("All text preprocessing completed!")

  0%|          | 0/279 [00:00<?, ?it/s]
100%|██████████| 5000/5000 [00:00<00:00, 144302.76it/s]

100%|██████████| 5000/5000 [00:00<00:00, 244988.67it/s]

100%|██████████| 5000/5000 [00:00<00:00, 63498.30it/s]

100%|██████████| 5000/5000 [00:00<00:00, 176687.08it/s]

100%|██████████| 5000/5000 [00:00<00:00, 142594.53it/s]

100%|██████████| 5000/5000 [00:00<00:00, 160649.60it/s]
  0%|          | 1/279 [00:00<02:08,  2.16it/s]
100%|██████████| 5000/5000 [00:00<00:00, 129550.59it/s]

100%|██████████| 5000/5000 [00:00<00:00, 241693.69it/s]

100%|██████████| 5000/5000 [00:00<00:00, 64653.28it/s]

100%|██████████| 5000/5000 [00:00<00:00, 187314.28it/s]

100%|██████████| 5000/5000 [00:00<00:00, 177932.84it/s]

100%|██████████| 5000/5000 [00:00<00:00, 172571.01it/s]
  1%|          | 2/279 [00:00<02:05,  2.21it/s]
100%|██████████| 5000/5000 [00:00<00:00, 138254.31it/s]

100%|██████████| 5000/5000 [00:00<00:00, 260813.85it/s]

100%|██████████| 5000/5000 [00:00<00:00, 64213.60it/s]

100%|█████████

All text preprocessing completed!


### 'brand_name' 결측치 채우기

In [18]:
# Create 'combined text' column by concatenating 'name' and 'item_description'
train_df['combined_text'] = train_df['name'] + ' ' + train_df['item_description']

# Strip leading/trailing spaces
train_df['combined_text'] = train_df['combined_text'].str.strip()

In [19]:
brand_null_head_idx = train_df[train_df['brand_name'].isnull()].head(50).index.tolist()
train_df[train_df['brand_name'].isnull()].head(50)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,category_1,category_2,category_3,combined_text
2,3,leather horse statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,new with tags leather horses retail for rm each stand about a foot high they are being sold as a pair any questions please ask free shipping just got out of storage,home,home décor,home décor accents,leather horse statues new with tags leather horses retail for rm each stand about a foot high they are being sold as a pair any questions please ask free shipping just got out of storage
3,4,24k gold plated rose,1,Women/Jewelry/Necklaces,,44.0,0,complete with certificate of authenticity,women,jewelry,necklaces,24k gold plated rose complete with certificate of authenticity
4,5,bundled items requested for ruie,3,Women/Other/Other,,59.0,0,banana republic bottoms candies skirt with matching blazeramy byers suit loft bottoms and cami top,women,other,other,bundled items requested for ruie banana republic bottoms candies skirt with matching blazeramy byers suit loft bottoms and cami top
8,9,porcelain clown doll checker pants vtg,3,Vintage & Collectibles/Collectibles/Doll,,8.0,0,i realized his pants are on backwards after the picture they were very dirty so i hand washed them he has a stuffed body and painted porcelain head hands and feet back before clowns were too scary 9 tall no chips or cracks but minor paint loss in a few places clown circus doll collectible,vintage collectibles,collectibles,doll,porcelain clown doll checker pants vtg i realized his pants are on backwards after the picture they were very dirty so i hand washed them he has a stuffed body and painted porcelain head hands and feet back before clowns were too scary 9 tall no chips or cracks but minor paint loss in a few places clown circus doll collectible
15,17,new baby ktan active baby carrier,1,Kids/Gear/Backpacks & Carriers,,29.0,1,brand new in box size medium color coral retails for rm the baby ktan active is made of a breathable hitech performance fabric that wicks away moisture and sweat blocks over 90 of the suns harmful uva and uvb rays and provides a unique temperature control ergonomic positioning for healthy infant development evenly distributes weight across back and shoulders doubleloop design slips on like a tshirt,kids,gear,backpacks carriers,new baby ktan active baby carrier brand new in box size medium color coral retails for rm the baby ktan active is made of a breathable hitech performance fabric that wicks away moisture and sweat blocks over 90 of the suns harmful uva and uvb rays and provides a unique temperature control ergonomic positioning for healthy infant development evenly distributes weight across back and shoulders doubleloop design slips on like a tshirt
21,23,triple car charger,1,Electronics/Cell Phones & Accessories/Chargers & Cradles,,8.0,1,brand new never used all colors are available each only rm,electronics,cell phones accessories,chargers cradles,triple car charger brand new never used all colors are available each only rm
31,34,four puppy dog stuffed animal ty,3,Kids/Toys/Stuffed Animals & Plush,,5.0,1,2 beanie babies pugsley wrinkles puppy with pumpkin big dog retro pinup doll frilly skirted adorable barbie pinksuper cute fan martini cherry pinup anchor marabou kitschoure beasweetlollipopinaworldofsoursuckers from love in sunny san diego california us of a absolutely adorable soft genuine real bunny fox foxy loxy fur stunning and gorgeous bambi long eyelashes sweet cheeks blush pink juicy dollface candy colors girlfriend dance club baby panty present for the girl who has everything cupcake couture resin kawaii lolita dress up gift for lime crime pegasus unicorn flamingo swan princess kitty pinup rockabilly girls jewelry and wild fox lolita kawaii gypsy wedding festival edf electronic dance rave raver coachella party,kids,toys,stuffed animals plush,four puppy dog stuffed animal ty 2 beanie babies pugsley wrinkles puppy with pumpkin big dog retro pinup doll frilly skirted adorable barbie pinksuper cute fan martini cherry pinup anchor marabou kitschoure beasweetlollipopinaworldofsoursuckers from love in sunny san diego california us of a absolutely adorable soft genuine real bunny fox foxy loxy fur stunning and gorgeous bambi long eyelashes sweet cheeks blush pink juicy dollface candy colors girlfriend dance club baby panty present for the girl who has everything cupcake couture resin kawaii lolita dress up gift for lime crime pegasus unicorn flamingo swan princess kitty pinup rockabilly girls jewelry and wild fox lolita kawaii gypsy wedding festival edf electronic dance rave raver coachella party
32,35,black capri leggings w flowers one size,1,"Women/Athletic Apparel/Pants, Tights, Leggings",,13.0,1,one size fits sizes 212 92 polyester 8 spandex super soft capri leggings high waist 1in elastic waist band,women,athletic apparel,pants tights leggings,black capri leggings w flowers one size one size fits sizes 212 92 polyester 8 spandex super soft capri leggings high waist 1in elastic waist band
33,36,black bag 30,3,Women/Women's Handbags/Totes & Shoppers,,209.0,0,preowned some light scratches on hardware consistent with gentle use real togo leather comes with twilly,women,womens handbags,totes shoppers,black bag 30 preowned some light scratches on hardware consistent with gentle use real togo leather comes with twilly
39,42,lots of korean nature republic face mask,1,Beauty/Skin Care/Face,,14.0,1,totally 36 masks will be expired on feb,beauty,skin care,face,lots of korean nature republic face mask totally 36 masks will be expired on feb


In [20]:
brand_null_tail_idx = train_df[train_df['brand_name'].isnull()].tail(50).index.tolist()
train_df[train_df['brand_name'].isnull()].tail(50)

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,category_1,category_2,category_3,combined_text
1393168,1482407,slim extreme fat burner serum,1,Beauty/Skin Care/Body,,16.0,0,firming fighting cellulite sliming body contour brand new super effects,beauty,skin care,body,slim extreme fat burner serum firming fighting cellulite sliming body contour brand new super effects
1393171,1482410,gel memory foam lumbar pillow,1,Home/Bedding/Other,,18.0,0,gel memory foam lumbar pillow by modernhome brand new in box pillow was uniquely designed to offer twotiered support its dense contouring memory foam material provides just the right amount of allday support for your backs lumbar region the combination of advanced gel technology and breathable ventilated fabric helps resist heat to keep your back dry cool and comfortable includes one white zipper pillow case cover primary material memory foam overall dimensions 1325h by 125w,home,bedding,other,gel memory foam lumbar pillow gel memory foam lumbar pillow by modernhome brand new in box pillow was uniquely designed to offer twotiered support its dense contouring memory foam material provides just the right amount of allday support for your backs lumbar region the combination of advanced gel technology and breathable ventilated fabric helps resist heat to keep your back dry cool and comfortable includes one white zipper pillow case cover primary material memory foam overall dimensions 1325h by 125w
1393173,1482412,kylie jenner candy k,1,Beauty/Makeup/Lips,,6.0,1,brand new never used authentic fast shipping,beauty,makeup,lips,kylie jenner candy k brand new never used authentic fast shipping
1393174,1482413,lg pink hello kitty silicone baking mold,1,Home/Kitchen & Dining/Bakeware,,10.0,1,this is for a brand new pink hello kitty silicone baking mold it is perfect for baking candy chocolate ice and so much more made from food grade silicone it measures at about 75 inches by 63 inches and about 087 of an inch thick it can go up to a temperature of 410 degrees fahrenheit and down to 40 degrees fahrenheit there are 16 hello kitty spaces on the mold and each measure at about 125 inches by about 1 inch each hello kitty head is the same mold even the mold is shaped like hello kitty please note i have many molds and all are brand new and have never been used they will come in a poly or ziplock bag in a bubble mailer,home,kitchen dining,bakeware,lg pink hello kitty silicone baking mold this is for a brand new pink hello kitty silicone baking mold it is perfect for baking candy chocolate ice and so much more made from food grade silicone it measures at about 75 inches by 63 inches and about 087 of an inch thick it can go up to a temperature of 410 degrees fahrenheit and down to 40 degrees fahrenheit there are 16 hello kitty spaces on the mold and each measure at about 125 inches by about 1 inch each hello kitty head is the same mold even the mold is shaped like hello kitty please note i have many molds and all are brand new and have never been used they will come in a poly or ziplock bag in a bubble mailer
1393176,1482415,bellora queen duvet set,1,Home/Bedding/Duvet Covers & Sets,,19.0,0,bellora queen duvet set brand new in package white with tan trim comes with duvet cover and 2 queen shams will bundle purchases to save on shipping no free shipping,home,bedding,duvet covers sets,bellora queen duvet set bellora queen duvet set brand new in package white with tan trim comes with duvet cover and 2 queen shams will bundle purchases to save on shipping no free shipping
1393178,1482417,nwt boys shirts size 7,1,Kids/Boys (4+)/Top & T-shirts,,14.0,0,nwt boys size 7 3 boys shirts picture 1 is the front picture 2 is the back 1batman 2ninja turtles firm price smoke free pet free home 100 polyester,kids,boys 4,top tshirts,nwt boys shirts size 7 nwt boys size 7 3 boys shirts picture 1 is the front picture 2 is the back 1batman 2ninja turtles firm price smoke free pet free home 100 polyester
1393180,1482419,small blue lularoe joy,1,Women/Coats & Jackets/Vest,,40.0,1,slate blue lace vest brand new free shipping soft lace,women,coats jackets,vest,small blue lularoe joy slate blue lace vest brand new free shipping soft lace
1393185,1482424,marilyn monroe crochet lace 1pc swim,1,Women/Swimwear/One-Piece,,26.0,0,nwot sold out everywhere super sexy sweetheart neck removable strapshook missing on one end easy fix please review pics open back with hookandbar closure colorblock crochet lace knit construction solid moderate coverage fully lined,women,swimwear,onepiece,marilyn monroe crochet lace 1pc swim nwot sold out everywhere super sexy sweetheart neck removable strapshook missing on one end easy fix please review pics open back with hookandbar closure colorblock crochet lace knit construction solid moderate coverage fully lined
1393187,1482426,dream catcher aromatherapy necklace,1,Women/Jewelry/Necklaces,,7.0,1,this beautiful silver toned dream catcher aromatherapy necklace measures about 15 across it is brand new and still in the package it comes with one orange one light blue and one darker blue felt inserts you put a few drops of any aromatherapy oil on these felt pads and inhale throughout the day free shipping,women,jewelry,necklaces,dream catcher aromatherapy necklace this beautiful silver toned dream catcher aromatherapy necklace measures about 15 across it is brand new and still in the package it comes with one orange one light blue and one darker blue felt inserts you put a few drops of any aromatherapy oil on these felt pads and inhale throughout the day free shipping
1393190,1482429,84 holo pokemon cards,2,Vintage & Collectibles/Trading Cards/Animation,,26.0,1,kyurem slaking trainer defender crawdaunt murkrow zoroark trainer expshare energy special energy zorua throh pyroar growlithe litleo fennekin sigliyph treecko tangela pinsir pancham latias raikou torterra sawmpert audino palkia gyarados supporter flower shop lady rhyperior druddigon dragonair noivern articuno poliwrath axew probopass trainer mistys determination skrelp chespin golurk fraxure simisage axew totodile trainer super rod trainer manectric spirit link chatot bunnelby dragalge magcargo huntail trainer virbank city gym trainer ultra ball flygon raikou magezone stunfisk latios trainer magnetic storm trainer silver mirror solrock leavanny trainer root fossil lileep honedge sawk tynamo throh chespin talonflame reshiram zapdos yanmega trainer first ticket trainer weakness policy spheal avalugg phione zoroark goomy articuno salamence phanpy reshiram pidgey virizion torchic primeape trainer exp share swampert,vintage collectibles,trading cards,animation,84 holo pokemon cards kyurem slaking trainer defender crawdaunt murkrow zoroark trainer expshare energy special energy zorua throh pyroar growlithe litleo fennekin sigliyph treecko tangela pinsir pancham latias raikou torterra sawmpert audino palkia gyarados supporter flower shop lady rhyperior druddigon dragonair noivern articuno poliwrath axew probopass trainer mistys determination skrelp chespin golurk fraxure simisage axew totodile trainer super rod trainer manectric spirit link chatot bunnelby dragalge magcargo huntail trainer virbank city gym trainer ultra ball flygon raikou magezone stunfisk latios trainer magnetic storm trainer silver mirror solrock leavanny trainer root fossil lileep honedge sawk tynamo throh chespin talonflame reshiram zapdos yanmega trainer first ticket trainer weakness policy spheal avalugg phione zoroark goomy articuno salamence phanpy reshiram pidgey virizion torchic primeape trainer exp share swampert


In [21]:
#  Count occurrences of each `brand_name` by `category_1`
brand_cat_counts_df = train_df.groupby(['brand_name', 'category_1']).size()
brand_cat_counts_df = pd.DataFrame(brand_cat_counts_df).reset_index()
brand_cat_counts_df.columns = ['brand_name', 'category_1', 'count'] 
brand_cat_counts_df = brand_cat_counts_df.sort_values(by='count', ascending=False)
brand_cat_counts_df

Unnamed: 0,brand_name,category_1,count
4994,pink,women,50017
6749,victorias secret,women,37603
3887,lularoe,women,27885
4557,nike,women,23115
389,apple,electronics,16444
...,...,...,...
5236,rails,women,1
5238,rainbow shops,kids,1
2568,georgia boot,men,1
2566,george foreman,men,1


In [22]:
# Filter for brands with counts > threshold 
threshold = 10
brands_df = brand_cat_counts_df[brand_cat_counts_df['count'] > threshold]
brands_df

Unnamed: 0,brand_name,category_1,count
4994,pink,women,50017
6749,victorias secret,women,37603
3887,lularoe,women,27885
4557,nike,women,23115
389,apple,electronics,16444
...,...,...,...
1502,coach,beauty,11
2428,frankie b,women,11
5770,shimano fishing,sports outdoors,11
2671,gorjana,women,11


In [23]:
# List of irrelevant brand names to remove
irrelevant_brands = ['all', 'm', 'ring']

# Remove rows where 'brand_name' is in the irrelevant brands list
brands_df = brands_df[~brands_df['brand_name'].isin(irrelevant_brands)]

In [24]:
# Group by 'category_1' and aggregate 'brand_name' into lists
brand_cat_dict = brands_df.groupby('category_1')['brand_name'].apply(list).to_dict()

In [25]:
# # Function to find and update the brand based on `category_1`
# def update_brand(row, brand_cat_dict):
#     if pd.isnull(row['brand_name']) or row['brand_name'] == 'all':
#         combined_text = row['combined_text']
#         category = row['category_1']
#         if category in brand_cat_dict:
#             # Check for each brand in the category
#             for brand in brand_cat_dict[category]:
#                 if re.search(r'\b' + re.escape(brand) + r'\b', combined_text):
#                     return brand
#     return row['brand_name']

# # Apply the function to each row in the DataFrame with progress bar
# tqdm.pandas(desc="Updating brands")
# train_df['brand_name'] = train_df.progress_apply(lambda row: update_brand(row, brand_cat_dict), axis=1)

In [26]:
# Sort the brand names in each category by length in descending order
for category in brand_cat_dict:
    brand_cat_dict[category] = sorted(brand_cat_dict[category], key=len, reverse=True)

# Function to find and update the brand based on `category_1`
def update_brand(row, brand_cat_dict):
    if pd.isnull(row['brand_name']) or row['brand_name'] == 'all':
        combined_text = row['combined_text']
        category = row['category_1']
        if category in brand_cat_dict:
            # Check for each brand in the category (longest first due to sorting)
            for brand in brand_cat_dict[category]:
                # Use word boundary to ensure exact match
                if re.search(r'\b' + re.escape(brand) + r'\b', combined_text):
                    return brand
    return row['brand_name']

chunk_size = 5000  # Adjust based on your memory capacity
num_chunks = (len(train_df) // chunk_size) + 1

# Initialize tqdm
tqdm.pandas()

# Process the data in chunks
processed_chunks = []
for i in tqdm(range(num_chunks)):
    start = i * chunk_size
    end = min((i + 1) * chunk_size, len(train_df))
    chunk = train_df.iloc[start:end].copy()
    
    # Apply the update_brand function to each row in the chunk
    chunk['brand_name'] = chunk.progress_apply(lambda row: update_brand(row, brand_cat_dict), axis=1)
    
    processed_chunks.append(chunk)
    
    # Explicitly delete chunk and call garbage collection to free memory
    del chunk
    gc.collect()

# Concatenate all chunks back into a single DataFrame
train_df = pd.concat(processed_chunks).reset_index(drop=True)

print("Brand updating completed!")

  0%|          | 0/279 [00:00<?, ?it/s]
  0%|          | 0/5000 [00:00<?, ?it/s][A
  1%|          | 34/5000 [00:00<00:18, 262.89it/s][A
  1%|          | 61/5000 [00:00<00:22, 218.67it/s][A
  2%|▏         | 84/5000 [00:00<00:30, 160.03it/s][A
  2%|▏         | 102/5000 [00:00<00:31, 153.91it/s][A
  2%|▎         | 125/5000 [00:00<00:30, 160.47it/s][A
  3%|▎         | 150/5000 [00:00<00:29, 167.13it/s][A
  3%|▎         | 173/5000 [00:01<00:28, 171.40it/s][A
  4%|▍         | 195/5000 [00:01<00:28, 168.15it/s][A
  4%|▍         | 222/5000 [00:01<00:25, 190.81it/s][A
  5%|▍         | 242/5000 [00:01<00:26, 182.42it/s][A
  5%|▌         | 263/5000 [00:01<00:25, 185.04it/s][A
  6%|▌         | 282/5000 [00:01<00:27, 170.42it/s][A
  6%|▌         | 300/5000 [00:01<00:33, 141.99it/s][A
  6%|▋         | 322/5000 [00:01<00:29, 158.50it/s][A
  7%|▋         | 348/5000 [00:02<00:26, 175.90it/s][A
  7%|▋         | 367/5000 [00:02<00:30, 150.34it/s][A
  8%|▊         | 399/5000 [00:02<00:25,

Brand updating completed!


In [27]:
# 'pink' > 'victorias secret' (if applicable)

# Define a function to check and update the brand name
def update_brand_pink(row):
    if row['brand_name'] == 'pink':
        if 'victorias secret' in row['combined_text'] or 'vs' in row['combined_text']:
            return 'victorias secret'
    return row['brand_name']

# Apply the function to each row in the dataframe using .loc to avoid SettingWithCopyWarning
train_df.loc[:, 'brand_name'] = train_df.apply(update_brand_pink, axis=1)

In [28]:
# 'm' > 'motorola' (if applicable)

m_mask = train_df['brand_name'] == 'm'

for idx in tqdm(train_df[m_mask].index):
    if (('motorola' in train_df.loc[idx, 'name']) or 
        ('motorola' in train_df.loc[idx, 'item_description'])):
        train_df.loc[idx, 'brand_name'] = 'motorola'

100%|██████████| 94/94 [00:00<00:00, 112.93it/s]


In [29]:
# 'galaxy' to 'samsung' (if applicable)

galaxy_mask = train_df['brand_name'] == 'galaxy'

# Steps 2 and 3: For those rows, check 'name' and 'item_description', and update if necessary
for idx in tqdm(train_df[galaxy_mask].index):
    if (('samsung' in train_df.loc[idx, 'name']) or 
        ('samsung' in train_df.loc[idx, 'item_description'])):
        train_df.loc[idx, 'brand_name'] = 'samsung'

100%|██████████| 60/60 [00:01<00:00, 54.46it/s]


In [32]:
train_df.iloc[brand_null_head_idx]

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,category_1,category_2,category_3,combined_text
2,3,leather horse statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,new with tags leather horses retail for rm each stand about a foot high they are being sold as a pair any questions please ask free shipping just got out of storage,home,home décor,home décor accents,leather horse statues new with tags leather horses retail for rm each stand about a foot high they are being sold as a pair any questions please ask free shipping just got out of storage
3,4,24k gold plated rose,1,Women/Jewelry/Necklaces,,44.0,0,complete with certificate of authenticity,women,jewelry,necklaces,24k gold plated rose complete with certificate of authenticity
4,5,bundled items requested for ruie,3,Women/Other/Other,banana republic,59.0,0,banana republic bottoms candies skirt with matching blazeramy byers suit loft bottoms and cami top,women,other,other,bundled items requested for ruie banana republic bottoms candies skirt with matching blazeramy byers suit loft bottoms and cami top
8,9,porcelain clown doll checker pants vtg,3,Vintage & Collectibles/Collectibles/Doll,,8.0,0,i realized his pants are on backwards after the picture they were very dirty so i hand washed them he has a stuffed body and painted porcelain head hands and feet back before clowns were too scary 9 tall no chips or cracks but minor paint loss in a few places clown circus doll collectible,vintage collectibles,collectibles,doll,porcelain clown doll checker pants vtg i realized his pants are on backwards after the picture they were very dirty so i hand washed them he has a stuffed body and painted porcelain head hands and feet back before clowns were too scary 9 tall no chips or cracks but minor paint loss in a few places clown circus doll collectible
15,17,new baby ktan active baby carrier,1,Kids/Gear/Backpacks & Carriers,,29.0,1,brand new in box size medium color coral retails for rm the baby ktan active is made of a breathable hitech performance fabric that wicks away moisture and sweat blocks over 90 of the suns harmful uva and uvb rays and provides a unique temperature control ergonomic positioning for healthy infant development evenly distributes weight across back and shoulders doubleloop design slips on like a tshirt,kids,gear,backpacks carriers,new baby ktan active baby carrier brand new in box size medium color coral retails for rm the baby ktan active is made of a breathable hitech performance fabric that wicks away moisture and sweat blocks over 90 of the suns harmful uva and uvb rays and provides a unique temperature control ergonomic positioning for healthy infant development evenly distributes weight across back and shoulders doubleloop design slips on like a tshirt
21,23,triple car charger,1,Electronics/Cell Phones & Accessories/Chargers & Cradles,,8.0,1,brand new never used all colors are available each only rm,electronics,cell phones accessories,chargers cradles,triple car charger brand new never used all colors are available each only rm
31,34,four puppy dog stuffed animal ty,3,Kids/Toys/Stuffed Animals & Plush,barbie,5.0,1,2 beanie babies pugsley wrinkles puppy with pumpkin big dog retro pinup doll frilly skirted adorable barbie pinksuper cute fan martini cherry pinup anchor marabou kitschoure beasweetlollipopinaworldofsoursuckers from love in sunny san diego california us of a absolutely adorable soft genuine real bunny fox foxy loxy fur stunning and gorgeous bambi long eyelashes sweet cheeks blush pink juicy dollface candy colors girlfriend dance club baby panty present for the girl who has everything cupcake couture resin kawaii lolita dress up gift for lime crime pegasus unicorn flamingo swan princess kitty pinup rockabilly girls jewelry and wild fox lolita kawaii gypsy wedding festival edf electronic dance rave raver coachella party,kids,toys,stuffed animals plush,four puppy dog stuffed animal ty 2 beanie babies pugsley wrinkles puppy with pumpkin big dog retro pinup doll frilly skirted adorable barbie pinksuper cute fan martini cherry pinup anchor marabou kitschoure beasweetlollipopinaworldofsoursuckers from love in sunny san diego california us of a absolutely adorable soft genuine real bunny fox foxy loxy fur stunning and gorgeous bambi long eyelashes sweet cheeks blush pink juicy dollface candy colors girlfriend dance club baby panty present for the girl who has everything cupcake couture resin kawaii lolita dress up gift for lime crime pegasus unicorn flamingo swan princess kitty pinup rockabilly girls jewelry and wild fox lolita kawaii gypsy wedding festival edf electronic dance rave raver coachella party
32,35,black capri leggings w flowers one size,1,"Women/Athletic Apparel/Pants, Tights, Leggings",,13.0,1,one size fits sizes 212 92 polyester 8 spandex super soft capri leggings high waist 1in elastic waist band,women,athletic apparel,pants tights leggings,black capri leggings w flowers one size one size fits sizes 212 92 polyester 8 spandex super soft capri leggings high waist 1in elastic waist band
33,36,black bag 30,3,Women/Women's Handbags/Totes & Shoppers,,209.0,0,preowned some light scratches on hardware consistent with gentle use real togo leather comes with twilly,women,womens handbags,totes shoppers,black bag 30 preowned some light scratches on hardware consistent with gentle use real togo leather comes with twilly
39,42,lots of korean nature republic face mask,1,Beauty/Skin Care/Face,,14.0,1,totally 36 masks will be expired on feb,beauty,skin care,face,lots of korean nature republic face mask totally 36 masks will be expired on feb


In [31]:
train_df.iloc[brand_null_head_idx]

Unnamed: 0,train_id,name,item_condition_id,category_name,brand_name,price,shipping,item_description,category_1,category_2,category_3,combined_text
2,3,leather horse statues,1,Home/Home Décor/Home Décor Accents,,35.0,1,new with tags leather horses retail for rm each stand about a foot high they are being sold as a pair any questions please ask free shipping just got out of storage,home,home décor,home décor accents,leather horse statues new with tags leather horses retail for rm each stand about a foot high they are being sold as a pair any questions please ask free shipping just got out of storage
3,4,24k gold plated rose,1,Women/Jewelry/Necklaces,,44.0,0,complete with certificate of authenticity,women,jewelry,necklaces,24k gold plated rose complete with certificate of authenticity
4,5,bundled items requested for ruie,3,Women/Other/Other,banana republic,59.0,0,banana republic bottoms candies skirt with matching blazeramy byers suit loft bottoms and cami top,women,other,other,bundled items requested for ruie banana republic bottoms candies skirt with matching blazeramy byers suit loft bottoms and cami top
8,9,porcelain clown doll checker pants vtg,3,Vintage & Collectibles/Collectibles/Doll,,8.0,0,i realized his pants are on backwards after the picture they were very dirty so i hand washed them he has a stuffed body and painted porcelain head hands and feet back before clowns were too scary 9 tall no chips or cracks but minor paint loss in a few places clown circus doll collectible,vintage collectibles,collectibles,doll,porcelain clown doll checker pants vtg i realized his pants are on backwards after the picture they were very dirty so i hand washed them he has a stuffed body and painted porcelain head hands and feet back before clowns were too scary 9 tall no chips or cracks but minor paint loss in a few places clown circus doll collectible
15,17,new baby ktan active baby carrier,1,Kids/Gear/Backpacks & Carriers,,29.0,1,brand new in box size medium color coral retails for rm the baby ktan active is made of a breathable hitech performance fabric that wicks away moisture and sweat blocks over 90 of the suns harmful uva and uvb rays and provides a unique temperature control ergonomic positioning for healthy infant development evenly distributes weight across back and shoulders doubleloop design slips on like a tshirt,kids,gear,backpacks carriers,new baby ktan active baby carrier brand new in box size medium color coral retails for rm the baby ktan active is made of a breathable hitech performance fabric that wicks away moisture and sweat blocks over 90 of the suns harmful uva and uvb rays and provides a unique temperature control ergonomic positioning for healthy infant development evenly distributes weight across back and shoulders doubleloop design slips on like a tshirt
21,23,triple car charger,1,Electronics/Cell Phones & Accessories/Chargers & Cradles,,8.0,1,brand new never used all colors are available each only rm,electronics,cell phones accessories,chargers cradles,triple car charger brand new never used all colors are available each only rm
31,34,four puppy dog stuffed animal ty,3,Kids/Toys/Stuffed Animals & Plush,barbie,5.0,1,2 beanie babies pugsley wrinkles puppy with pumpkin big dog retro pinup doll frilly skirted adorable barbie pinksuper cute fan martini cherry pinup anchor marabou kitschoure beasweetlollipopinaworldofsoursuckers from love in sunny san diego california us of a absolutely adorable soft genuine real bunny fox foxy loxy fur stunning and gorgeous bambi long eyelashes sweet cheeks blush pink juicy dollface candy colors girlfriend dance club baby panty present for the girl who has everything cupcake couture resin kawaii lolita dress up gift for lime crime pegasus unicorn flamingo swan princess kitty pinup rockabilly girls jewelry and wild fox lolita kawaii gypsy wedding festival edf electronic dance rave raver coachella party,kids,toys,stuffed animals plush,four puppy dog stuffed animal ty 2 beanie babies pugsley wrinkles puppy with pumpkin big dog retro pinup doll frilly skirted adorable barbie pinksuper cute fan martini cherry pinup anchor marabou kitschoure beasweetlollipopinaworldofsoursuckers from love in sunny san diego california us of a absolutely adorable soft genuine real bunny fox foxy loxy fur stunning and gorgeous bambi long eyelashes sweet cheeks blush pink juicy dollface candy colors girlfriend dance club baby panty present for the girl who has everything cupcake couture resin kawaii lolita dress up gift for lime crime pegasus unicorn flamingo swan princess kitty pinup rockabilly girls jewelry and wild fox lolita kawaii gypsy wedding festival edf electronic dance rave raver coachella party
32,35,black capri leggings w flowers one size,1,"Women/Athletic Apparel/Pants, Tights, Leggings",,13.0,1,one size fits sizes 212 92 polyester 8 spandex super soft capri leggings high waist 1in elastic waist band,women,athletic apparel,pants tights leggings,black capri leggings w flowers one size one size fits sizes 212 92 polyester 8 spandex super soft capri leggings high waist 1in elastic waist band
33,36,black bag 30,3,Women/Women's Handbags/Totes & Shoppers,,209.0,0,preowned some light scratches on hardware consistent with gentle use real togo leather comes with twilly,women,womens handbags,totes shoppers,black bag 30 preowned some light scratches on hardware consistent with gentle use real togo leather comes with twilly
39,42,lots of korean nature republic face mask,1,Beauty/Skin Care/Face,,14.0,1,totally 36 masks will be expired on feb,beauty,skin care,face,lots of korean nature republic face mask totally 36 masks will be expired on feb


In [33]:
train_df.isnull().sum()

train_id                  0
name                      0
item_condition_id         0
category_name             0
brand_name           399141
price                     0
shipping                  0
item_description          0
category_1                0
category_2                0
category_3                0
combined_text             0
dtype: int64

In [None]:
# # 결측치 처리 

# train_df['brand_name'] = train_df['brand_name'].fillna(value='Unknown')

In [34]:
train_df['brand_name'].value_counts()

victorias secret    84578
lularoe             60135
nike                53004
pink                38857
apple               19723
                    ...  
olook                   1
usaf                    1
ramones                 1
arcona                  1
plugg                   1
Name: brand_name, Length: 4710, dtype: int64

In [35]:
# Save the DataFrame to a CSV file
train_df.to_csv('train_df_preprocessed.csv', index=False)

# 시각화

In [None]:
# # 카테고리 1 빈도 시각화
# category_1_counts = train_df['category_1'].value_counts()

# plt.figure(figsize=(8, 8))
# plt.pie(category_1_counts, labels=category_1_counts.index, autopct='%1.1f%%', startangle=140)
# plt.title('Distribution of Category 1')
# plt.axis('equal')  # 원형 차트를 원형으로 유지
# plt.show()

# # 카테고리 2 빈도 시각화
# category_2_counts = train_df['category_2'].value_counts()

# plt.figure(figsize=(8, 8))
# plt.pie(category_2_counts, labels=category_2_counts.index, autopct='%1.1f%%', startangle=140)
# plt.title('Distribution of Category 2')
# plt.axis('equal')
# plt.show()

# # 카테고리 3 빈도 시각화
# category_3_counts = train_df['category_3'].value_counts()

# plt.figure(figsize=(8, 8))
# plt.pie(category_3_counts, labels=category_3_counts.index, autopct='%1.1f%%', startangle=140)
# plt.title('Distribution of Category 3')
# plt.axis('equal')
# plt.show()

In [None]:
# # 카테고리별 평균 가격 계산
# category_1_avg_price = train_df.groupby('category_1')['price'].mean()
# category_2_avg_price = train_df.groupby('category_2')['price'].mean()
# category_3_avg_price = train_df.groupby('category_3')['price'].mean()

# # 시각화 - 막대 그래프

# # 카테고리 1 평균 가격 시각화
# plt.figure(figsize=(12, 6))
# category_1_avg_price.plot(kind='bar', color='blue')
# plt.title('Average Price by Category 1')
# plt.xlabel('Category 1')
# plt.ylabel('Average Price')
# plt.xticks(rotation=45)
# plt.show()

# # 카테고리 2 평균 가격 시각화
# plt.figure(figsize=(12, 6))
# category_2_avg_price.plot(kind='bar', color='green')
# plt.title('Average Price by Category 2')
# plt.xlabel('Category 2')
# plt.ylabel('Average Price')
# plt.xticks(rotation=90)
# plt.show()

# # 카테고리 3 평균 가격 시각화
# plt.figure(figsize=(12, 6))
# category_3_avg_price.plot(kind='bar', color='orange')
# plt.title('Average Price by Category 3')
# plt.xlabel('Category 3')
# plt.ylabel('Average Price')
# plt.xticks(rotation=90)
# plt.show()

In [None]:
# category_counts = train_df['category_name'].value_counts()

# top_30_categories = category_counts.head(30)

# print("Top 30 categories by frequency:")
# print(top_30_categories)

In [None]:
# category1_counts = train_df['category_1'].value_counts()

# category1_top = category1_counts

# print("Category_1 by frequency:")
# print(category1_top)

In [None]:
# category2_counts = train_df['category_2'].value_counts()

# category2_top_30 = category2_counts.head(30)

# print("Category_2 top 30 by frequency:")
# print(category2_top_30)

In [None]:
train_df['category_1+2'] = train_df['category_1'] + '/' + train_df['category_2']
train_df.head()

In [None]:
# category12_counts = train_df['category_1+2'].value_counts()

# category12_top_30 = category12_counts.head(30)

# print("Category_1+2 top 30 by frequency:")
# print(category12_top_30)

In [None]:
plt.figure(figsize=(18, 12))
sns.boxplot(data=train_df, x='category_1', y='price')
plt.show()

In [None]:
# # 카테고리별 평균 가격 계산
# category_1_avg_price = train_df.groupby('category_1')['price'].mean()
# category_2_avg_price = train_df.groupby('category_2')['price'].mean()
# category_3_avg_price = train_df.groupby('category_3')['price'].mean()

# # 시각화 - 막대 그래프

# # 카테고리 1 평균 가격 시각화
# plt.figure(figsize=(12, 6))
# category_1_avg_price.plot(kind='bar', color='blue')
# plt.title('Average Price by Category 1')
# plt.xlabel('Category 1')
# plt.ylabel('Average Price')
# plt.xticks(rotation=45)
# plt.show()

# # 카테고리 2 평균 가격 시각화
# plt.figure(figsize=(12, 6))
# category_2_avg_price.plot(kind='bar', color='green')
# plt.title('Average Price by Category 2')
# plt.xlabel('Category 2')
# plt.ylabel('Average Price')
# plt.xticks(rotation=90)
# plt.show()

# # 카테고리 3 평균 가격 시각화
# plt.figure(figsize=(12, 6))
# category_3_avg_price.plot(kind='bar', color='orange')
# plt.title('Average Price by Category 3')
# plt.xlabel('Category 3')
# plt.ylabel('Average Price')
# plt.xticks(rotation=90)
# plt.show()

# 텍스트 처리: Bag of Words

In [None]:
# Download NLTK resources (run this only once)
nltk.download('stopwords')

In [None]:
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Tokenize and remove stop words
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    # Stem words
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    return ' '.join(words)

In [None]:
def combined_bag_of_words(df, text_columns, max_features=1000):
    # Combine the specified text columns into one column for each row
    df['combined_text'] = df[text_columns].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)
    
    # Preprocess the combined text
    df['combined_text'] = df['combined_text'].apply(preprocess_text)
    
    # Initialize CountVectorizer with max features
    vectorizer = CountVectorizer(binary=True, max_features=max_features)  # Limit vocabulary size
    X = vectorizer.fit_transform(df['combined_text'])
    
    # Convert the sparse matrix to a DataFrame
    bag_of_words_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
    
    # Drop the temporary combined column
    df.drop('combined_text', axis=1, inplace=True)
    
    return bag_of_words_df

# Apply the function to the 'name' and 'item_description' columns with a max_features limit
bow_df = combined_bag_of_words(train_df, ['name', 'item_description'], max_features=1000)

print(bow_df)

In [None]:
from wordcloud import WordCloud

# Frequency analysis
word_counts = combined_bag_of_words.sum(axis=0).sort_values(ascending=False)
print(word_counts.head(10))

# Visualization: Word Cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_counts)
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# Visualization: Bar Plot
top_words = word_counts.head(20)
top_words.plot(kind='bar', figsize=(10, 5))
plt.title('Top 20 Words')
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.show()