Set up the word list

In [3]:
import requests

# Download the plain-text word list from raw.githubusercontent.com; the
# github.com ".../blob/main/words" address is the HTML page for the file,
# not the list itself.
url = 'https://raw.githubusercontent.com/tabatkins/wordle-list/main/words'
# A timeout keeps the cell from hanging forever if the server never answers.
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of parsing an error page as words.
response.raise_for_status()

# One word per line; splitlines() also copes with CRLF line endings.
wordle_words = response.text.strip().splitlines()
print("Sample words:", wordle_words[:5])
print("Total words:", len(wordle_words))

Sample words: ['rossa', 'jetty', 'wizzo', 'cuppa', 'cohoe']
Total words: 14855


In [4]:
# Get each word's frequency in English text

In [7]:
!pip install wordfreq
from wordfreq import word_frequency

Collecting wordfreq
  Downloading wordfreq-3.1.1-py3-none-any.whl.metadata (27 kB)
Collecting ftfy>=6.1 (from wordfreq)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting locate<2.0.0,>=1.1.1 (from wordfreq)
  Downloading locate-1.1.1-py3-none-any.whl.metadata (3.9 kB)
Downloading wordfreq-3.1.1-py3-none-any.whl (56.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.8/56.8 MB 17.3 MB/s eta 0:00:00
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.8/44.8 kB 2.9 MB/s eta 0:00:00
Downloading locate-1.1.1-py3-none-any.whl (5.4 kB)
Installing collected packages: locate, ftfy, wordfreq
Successfully installed ftfy-6.3.1 locate-1.1.1 wordfreq-3.1.1


In [9]:
# Each frequency is a number between 0 and 1; a higher value means a more
# common word. The list is in the same order as wordle_words.
word_freqs = list(map(lambda entry: word_frequency(entry, 'en'), wordle_words))
print("Sample frequencies:", word_freqs[:5])




Sample frequencies: [1.82e-07, 1.1e-06, 0.0, 5.01e-07, 0.0]


In [11]:
# Build the working table: one row per word, paired with its frequency.
import pandas as pd

df = pd.DataFrame(dict(word=wordle_words, frequency=word_freqs))
df.head()

Unnamed: 0,word,frequency
0,rossa,1.82e-07
1,jetty,1.1e-06
2,wizzo,0.0
3,cuppa,5.01e-07
4,cohoe,0.0


In [13]:
#adding more features

def is_vowel(v):
  """Return True if `v` is exactly one of the lowercase vowels a, e, i, o, u.

  Membership is tested against a set rather than the string 'aeiou', so the
  empty string and multi-character substrings such as 'ae' are not reported
  as vowels.
  """
  return v in {'a', 'e', 'i', 'o', 'u'}

def extract_features(word):
  """Compute the letter-based features for a single word.

  Returns a pd.Series holding, in order:
    - how many characters of `word` satisfy is_vowel,
    - how many characters of `word` are one of q, z, x, j, k,
    - whether any character occurs more than once in `word`.
  """
  vowel_total = len([ch for ch in word if is_vowel(ch)])
  rare_total = len([ch for ch in word if ch in 'qzxjk'])
  # A repeated character makes the set of characters smaller than the word.
  repeats_a_letter = len(word) > len(set(word))

  return pd.Series([vowel_total, rare_total, repeats_a_letter])

# extract_features returns three values per word; spread them over three
# new columns of df, in that order.
df[["vowel_count", "rare_letter_count", "has_duplicates"]] = (
    df["word"].apply(extract_features)
)
df.head()

Unnamed: 0,word,frequency,vowel_count,rare_letter_count,has_duplicates
0,rossa,1.82e-07,2,0,True
1,jetty,1.1e-06,1,1,True
2,wizzo,0.0,2,2,True
3,cuppa,5.01e-07,2,0,True
4,cohoe,0.0,3,0,True


In [15]:
# Summarise the frequency column first, then report the three values.
avg_freq = df['frequency'].mean()
min_freq = df['frequency'].min()
max_freq = df['frequency'].max()

print("Average frequency", avg_freq)
print("Min frequency", min_freq)
print("Max frequency", max_freq)

Average frequency 6.854923789969707e-06
Min frequency 0.0
Max frequency 0.00251


In [17]:
#define difficulty levels- temp target variables

def difficulty_level(row, easy_min_freq=0.0001, hard_max_freq=0.00001, hard_min_rare=2):
  """Assign a provisional difficulty label to one word row.

  Parameters:
    row: mapping-like object (e.g. a DataFrame row) with the keys
      'frequency' and 'rare_letter_count'.
    easy_min_freq: a row is 'Easy' when its frequency is strictly above
      this value and it has no rare letters.
    hard_max_freq: a row that is not 'Easy' is 'Hard' when its frequency is
      strictly below this value ...
    hard_min_rare: ... or when it has at least this many rare letters.

  Returns:
    'Easy', 'Hard' or 'Medium' (everything that is neither Easy nor Hard).

  The defaults are the cut-offs this notebook has been using, so calling
  the function with only `row` (as df.apply does) gives the same labels as
  before. Other cut-offs can be tried via keyword arguments, e.g.
  df.apply(difficulty_level, axis=1, hard_max_freq=1e-6).
  """
  freq = row['frequency']
  if freq > easy_min_freq and row["rare_letter_count"] == 0:
    return 'Easy'
  # The Easy test runs first, so a common word with no rare letters is never
  # relabelled Hard by the checks below.
  if freq < hard_max_freq or row["rare_letter_count"] >= hard_min_rare:
    return "Hard"
  return "Medium"

# Label every row (the function is applied row by row), then show how many
# words fall into each difficulty class.
df['difficulty'] = df.apply(difficulty_level, axis='columns')
df['difficulty'].value_counts()

Unnamed: 0_level_0,count
difficulty,Unnamed: 1_level_1
Hard,13805
Medium,856
Easy,194


In [18]:
# Write the table to a CSV file; index=False leaves the row index out of it.
df.to_csv(path_or_buf='wordle_difficulty_data.csv', index=False)