In [17]:
import pandas as pd
import urllib.parse

# Load the IMDb 'title.basics' dataset
title_basics = pd.read_csv('imdb/title.basics.tsv.gz', sep='\t', dtype=str, na_values='\\N', compression='gzip')


In [18]:
import pandas as pd

# Load the IMDb 'title.basics' dataset
title_basics = pd.read_csv(
    'imdb/title.basics.tsv.gz',  # Adjust the path if necessary
    sep='\t',
    dtype=str,
    na_values='\\N',
    compression='gzip',
    low_memory=False  # Avoid dtype warnings
)

def get_imdb_urls(english_names):
    # Prepare a list to store results
    results = []

    # Convert 'primaryTitle' to lowercase for case-insensitive matching
    title_basics['primaryTitle_lower'] = title_basics['primaryTitle'].str.lower()

    # Filter for TV series to narrow down the search (optional)
    tv_series = title_basics[title_basics['titleType'] == 'tvSeries']

    # Iterate over each English name
    for name in english_names:
        name_lower = name.lower()
        # Search for exact matches in 'primaryTitle_lower'
        matches = tv_series[tv_series['primaryTitle_lower'] == name_lower]

        if not matches.empty:
            # If multiple matches, you might want to handle them accordingly
            # For simplicity, we'll take the first match
            row = matches.iloc[0]
            title_id = row['tconst']
            imdb_url = f"https://www.imdb.com/title/{title_id}/"
            results.append({
                'English Title': name,
                'IMDb URL': imdb_url
            })
        else:
            # Handle cases where no match is found
            results.append({
                'English Title': name,
                'IMDb URL': 'Not Found'
            })

    # Convert results to a DataFrame
    results_df = pd.DataFrame(results)
    return results_df

# Example usage

# Replace the following line with your actual DataFrame
df_turkish_drama = pd.read_csv('turkish_drama_data.csv')

# Extract the list of English names
# Ensure 'English Title' is of string type and handle NaN values
df_turkish_drama['English Title'] = df_turkish_drama['English Title'].astype(str)

# Replace 'nan' strings with empty strings or a placeholder
df_turkish_drama['English Title'].replace('nan', '', inplace=True)

# Extract the list of English names
english_names = df_turkish_drama['English Title'].tolist()
# Get the IMDb URLs
imdb_urls_df = get_imdb_urls(english_names)

# Merge the IMDb URLs back to the original DataFrame
df_turkish_drama = df_turkish_drama.merge(imdb_urls_df, on='English Title', how='left')



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_turkish_drama['English Title'].replace('nan', '', inplace=True)


In [19]:
# look at the updated DataFrame
import re
# Function to extract IMDb ID from URL
def extract_imdb_id(url):
    match = re.search(r'(tt\d+)', url)
    if match:
        return match.group(1)
    else:
        return None

# Apply the function to create a new 'IMDb ID' column
df_turkish_drama['IMDb ID'] = df_turkish_drama['IMDb URL'].apply(extract_imdb_id)


In [20]:
df_turkish_drama.rename(columns={'IMDb URL': 'IMDb URL from English', 'IMDb ID': 'IMDb ID from English'}, inplace=True)
df_turkish_drama.head()

Unnamed: 0,URL,Original Title,English Title,Genre,Episodes,Broadcast Network,Broadcast Start Date,Broadcast End Date,Production Company,Director,Screen Writer,Synopsis,IMDb URL from English,IMDb ID from English
0,https://www.turkishdrama.com/the-tailor-terzi-...,Terzi,The Tailor,"Drama, Romance",23.0,Netflix,2-May-23,3-Nov-23,OGM Pictures,Cem Karci,,Peyami Dokumaci (Cagatay Ulusoy) is a young an...,https://www.imdb.com/title/tt15473010/,tt15473010
1,https://www.turkishdrama.com/sapphire-safir-tv...,Safir,Sapphire,"Romance, Drama",17.0,Atv,4-Sep-23,Present,NTC Medya,Semih Bagci,,"Gulsoy family is a well-known, wealthy family ...",https://www.imdb.com/title/tt17048670/,tt17048670
2,https://www.turkishdrama.com/omer-tv-series.html,Ömer,Omer,"Drama, Family",34.0,Star TV,9-Jan-23,present,OGM Pictures,Cem Karci,"Gulizar Irmak, Deniz Madanoglu, Sedef Bayburtl...",Omer (Selahattin Pasali) is a young guy in his...,Not Found,
3,https://www.turkishdrama.com/forevermore-verme...,Vermem Seni Ellere,Forevermore,"Romance, Drama",9.0,Atv,18-Jun-23,13-Aug-23,AKN Film,Ali Balci,"Sehrazat Tunus Tasci, Damla Gucer, Samed Aslan...",Mehmet (Emre Bey) is a young guy who comes fro...,https://www.imdb.com/title/tt4183480/,tt4183480
4,https://www.turkishdrama.com/queen-kralice-tv-...,Kralice,Queen,"Drama, Romance",11.0,Kanal D,6-Apr-23,7-Jun-23,Mednova,"Cevdet Mercan, Serhan Sahin","Serdar Soydan, Kerem Bozok, Ekin Akcay, Nil Gu...",Deniz (Burcu Ozberk) and Ates (Gokhan Alkan) a...,https://www.imdb.com/title/tt11393148/,tt11393148


In [21]:
# write to csv:
df_turkish_drama.to_csv('turkish_drama_data_with_imdb_eng.csv', index=False)