In [13]:
import pandas as pd
from googletrans import Translator

In [14]:
# Load the raw Instagram exports, one CSV per brand (Porsche, BMW, Mercedes).
porsche_insta_df, bmw_insta_df, mercedes_insta_df = map(
    pd.read_csv,
    (
        "./data/porsche_raw_instagram_dataset.csv",
        "./data/bmw_raw_instagram_dataset.csv",
        "./data/mercedes_raw_instagram_dataset.csv",
    ),
)

In [15]:
# (rows, columns) of each raw export, one per line: Porsche, BMW, Mercedes.
print(
    porsche_insta_df.shape,
    bmw_insta_df.shape,
    mercedes_insta_df.shape,
    sep="\n",
)

(1268, 54)
(1000, 30)
(1000, 58)


In [16]:
# Preview the first five rows of the raw BMW export.
bmw_insta_df.head(n=5)

Unnamed: 0,postUrl,description,commentCount,likeCount,pubDate,isSidecar,type,caption,profileUrl,username,...,taggedFullName2,taggedUsername2,likedByViewer,fullName,videoDuration,playCount,taggedFullName3,taggedUsername3,taggedFullName4,taggedUsername4
0,https://www.instagram.com/p/Cyi54MaMxlQ/,Last call! Our Oktoberfest West is happening i...,1,113,2023-10-18T15:35:08.000Z,False,Photo,Photo by BMW Performance Driving School on Oct...,https://www.instagram.com/bmwperformancedrivin...,bmwperformancedrivingschools,...,,,,,,,,,,
1,https://www.instagram.com/p/CyipkWZulk0/,The #SpiritofGS was alive last weekend and we’...,2,220,2023-10-18T13:15:03.000Z,False,Video,,https://www.instagram.com/bmwmotorradusa,bmwmotorradusa,...,BMW US Rider Academy,bmwusrideracademy,,,,,,,,
2,https://www.instagram.com/p/CygGbSOuk5_/,Ride in a BMW racecar! Our M Fall Festival eve...,3,202,2023-10-17T13:27:03.000Z,False,Photo,Photo by BMW Performance Driving School on Oct...,https://www.instagram.com/bmwperformancedrivin...,bmwperformancedrivingschools,...,,,,,,,,,,
3,https://www.instagram.com/p/CyWBBrqO_vy/,BMW + the ExtremeContact Sport 02 = Extreme Pe...,3,304,2023-10-13T15:29:16.000Z,False,Video,,https://www.instagram.com/continental_tire,continental_tire,...,,,,,,,,,,
4,https://www.instagram.com/p/CyV4SCJtjFp/,It’s time to celebrate at Thermal! We’re hosti...,3,298,2023-10-13T14:11:03.000Z,False,Photo,Photo by BMW Performance Driving School on Oct...,https://www.instagram.com/bmwperformancedrivin...,bmwperformancedrivingschools,...,,,,,,,,,,


In [17]:
# Preview the first five rows of the raw Porsche export.
porsche_insta_df.head(n=5)

Unnamed: 0,postUrl,description,commentCount,likeCount,location,locationId,pubDate,isSidecar,type,caption,...,taggedFullName12,taggedUsername12,taggedFullName13,taggedUsername13,taggedFullName14,taggedUsername14,taggedFullName15,taggedUsername15,taggedFullName16,taggedUsername16
0,https://www.instagram.com/p/CpubRP2oX3X/,Adrenalinjäger aufgepasst: Am 18. März trifft ...,4,413,Porsche Experience Center Hockenheimring,106880800000000.0,2023-03-13T10:15:59.000Z,False,Photo,Photo by PEC Hockenheimring in Porsche Experie...,...,,,,,,,,,,
1,https://www.instagram.com/p/Ch4jE_4ogCh/,Erfülle Dir Deinen Traum mit unseren Fahrerleb...,0,425,Porsche Experience Center Hockenheimring,106880800000000.0,2022-08-30T12:28:31.000Z,False,Video,,...,,,,,,,,,,
2,https://www.instagram.com/p/Cfd5SLpIRhS/,"Die Rennstrecke ruft! Seid Ihr bereit, Euch de...",2,285,Porsche Experience Center Hockenheimring,106880800000000.0,2022-07-01T10:57:13.000Z,False,Photo,Photo by PEC Hockenheimring in Porsche Experie...,...,,,,,,,,,,
3,https://www.instagram.com/p/CyixwV1CUPf/,Die sicherste Strategie für eine entspannte We...,0,126,,,2023-10-18T14:24:08.000Z,False,Photo,"Photo by PEC Hockenheimring on October 18, 202...",...,,,,,,,,,,
4,https://www.instagram.com/p/Cydwscsr2M3/,Perfektes Wetter und ein spektakuläres Program...,2,234,,,2023-10-16T15:38:39.000Z,True,Photo,"Photo by PEC Hockenheimring on October 16, 202...",...,,,,,,,,,,


In [18]:
def drop_unecessary_columns(df: pd.DataFrame):
    """Return a copy of ``df`` without the columns the analysis does not use.

    Two groups of columns are removed:

    * the fixed set of names listed in ``column_list`` below, and
    * every column whose name starts with ``"tagged"``.

    Columns from ``column_list`` that are not present in ``df`` are skipped
    silently instead of raising ``KeyError``.

    Args:
        df: Raw Instagram export. It is not modified.

    Returns:
        A new DataFrame containing only the remaining columns.
    """
    column_list = [
        'location', 'locationId', "isSidecar", "query",
        "videoUrl", "timestamp", 'likedByViewer', "imgUrl",
        'fullName', 'videoDuration', "caption", "postId"]

    # The raw exports do not share one schema (the loaded frames have 54, 30
    # and 58 columns), so any of the listed columns may be missing from a
    # given frame; errors="ignore" keeps the helper usable for all of them.
    df = df.drop(columns=column_list, errors="ignore")

    # na=False treats non-string column labels as "not tagged" so that the
    # boolean mask can always be inverted.
    is_tagged = df.columns.str.startswith("tagged", na=False)
    df = df.loc[:, ~is_tagged]
    return df

def rename_columns(df: pd.DataFrame):
    """Return a copy of ``df`` with the post text and date columns renamed.

    ``description`` becomes ``postContent`` and ``pubDate`` becomes
    ``postTimestamp``. All other columns keep their names.
    """
    new_names = dict(description="postContent", pubDate="postTimestamp")
    renamed = df.rename(new_names, axis="columns")
    return renamed

def change_datatypes(df: pd.DataFrame):
    """Return a copy of ``df`` with ``postTimestamp`` parsed as datetimes.

    The conversion is written to a new frame rather than assigned into
    ``df``, so the caller's DataFrame keeps its original values and the
    function can be re-run on the same input safely.

    Args:
        df: Frame containing a ``postTimestamp`` column that
            ``pd.to_datetime`` can parse.

    Returns:
        A new DataFrame with the same columns, ``postTimestamp`` converted.

    Raises:
        KeyError: If ``df`` has no ``postTimestamp`` column.
    """
    return df.assign(postTimestamp=pd.to_datetime(df["postTimestamp"]))

def get_current_year_data(df: pd.DataFrame, year: int):
    """Return the rows posted on or after 1 January of ``year``, oldest first.

    The filter is a lower bound only: rows from years after ``year`` are
    kept as well. The result is sorted ascending by ``postTimestamp`` and
    keeps the original index labels.
    """
    first_day = f"{year}.01.01"
    on_or_after = df["postTimestamp"] >= first_day
    selected = df.loc[on_or_after]
    return selected.sort_values(by="postTimestamp")

In [19]:
def pipeline(df: pd.DataFrame, year: int = 2023):
    """Run the full preprocessing chain on one raw Instagram export.

    Steps, in order:

    1. ``drop_unecessary_columns`` - remove columns the analysis does not use.
    2. ``rename_columns`` - rename the post text and date columns.
    3. ``change_datatypes`` - parse ``postTimestamp`` as datetimes.
    4. ``get_current_year_data`` - keep rows from ``year`` onwards, sorted
       by ``postTimestamp``.

    Args:
        df: Raw export as loaded from CSV.
        year: First year to keep. Defaults to 2023, the value that was
            previously hard-coded.

    Returns:
        The preprocessed DataFrame.
    """
    return (
        df.pipe(drop_unecessary_columns)
        .pipe(rename_columns)
        .pipe(change_datatypes)
        .pipe(get_current_year_data, year=year)
    )

In [20]:
# Preprocess each brand's raw export with the shared pipeline.
porsche_df, bmw_df, mercedes_df = (
    pipeline(df=raw_frame)
    for raw_frame in (porsche_insta_df, bmw_insta_df, mercedes_insta_df)
)

## Splitting the text (Porsche)

In [21]:
# Preview the first five rows of the preprocessed Porsche data.
porsche_df.head(n=5)

Unnamed: 0,postUrl,postContent,commentCount,likeCount,postTimestamp,type,profileUrl,username,viewCount,playCount
414,https://www.instagram.com/p/Cm6QO3Uoz-A/,Das neue Jahr hat begonnen und wir können es s...,2,298,2023-01-02 10:56:14+00:00,Photo,https://www.instagram.com/pec.hhr,pec.hhr,,
413,https://www.instagram.com/p/Cm6QO3Uoz-A/,Das neue Jahr hat begonnen und wir können es s...,2,298,2023-01-02 10:56:14+00:00,Photo,https://www.instagram.com/pec.hhr,pec.hhr,,
411,https://www.instagram.com/p/CnAKVbkNMQG/,"Eines der großen Highlights, die 2023 auf Euch...",1,210,2023-01-04 18:00:09+00:00,Photo,https://www.instagram.com/pec.hhr,pec.hhr,,
412,https://www.instagram.com/p/CnAKVbkNMQG/,"Eines der großen Highlights, die 2023 auf Euch...",1,210,2023-01-04 18:00:09+00:00,Photo,https://www.instagram.com/pec.hhr,pec.hhr,,
410,https://www.instagram.com/p/CnAKVbkNMQG/,"Eines der großen Highlights, die 2023 auf Euch...",1,210,2023-01-04 18:00:09+00:00,Photo,https://www.instagram.com/pec.hhr,pec.hhr,,


In [22]:
# Build two parallel lists, one entry per row of porsche_df:
#   filtered_content_de - the first line of each post, as written
#   filtered_content_en - that first line translated to English
filtered_content_en = []
filtered_content_de = []
translator = Translator()

# Each translate() call is a network request, and the frame contains repeated
# posts (the preview above shows the same postUrl several times), so every
# distinct text is translated only once and then reused.
translations_by_text = {}

for post_text in porsche_df["postContent"]:
    # A post without a caption is presumably loaded as NaN (a float), which
    # has no .split(); treat anything that is not a string as empty text.
    first_line = post_text.split("\n")[0] if isinstance(post_text, str) else ""
    filtered_content_de.append(first_line)

    if first_line not in translations_by_text:
        if first_line.strip():
            translated = translator.translate(first_line, dest="en").text
        else:
            # Nothing to translate; avoid sending blank text to the service.
            translated = first_line
        translations_by_text[first_line] = translated

    filtered_content_en.append(translations_by_text[first_line])

In [23]:
# Attach the English and German first-line texts as two new columns.
porsche_df = porsche_df.assign(
    filteredContentEn=filtered_content_en,
    filteredContentDe=filtered_content_de,
)

## Saving preprocessed DF

In [25]:
# Persist the preprocessed Porsche frame (the index is written as the first column).
porsche_df.to_csv(path_or_buf="./data/porsche_instagram.csv")
# The BMW and Mercedes exports are left disabled; only the Porsche frame
# received the translated text columns above.
# bmw_df.to_csv("./data/bmw_instagram.csv")
# mercedes_df.to_csv("./data/mercedes_instagram.csv")

  values = values.astype(str)
